This is an automated email from the ASF dual-hosted git repository.
jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 372bcd2840 Add a quickstart for null handling related queries (#14182)
372bcd2840 is described below
commit 372bcd2840f8403d1b76648cce1fbc450cafdea2
Author: Yash Mayya <[email protected]>
AuthorDate: Thu Oct 10 03:33:36 2024 +0530
Add a quickstart for null handling related queries (#14182)
---
.../apache/pinot/tools/NullHandlingQuickstart.java | 107 +++++++++++++++++
.../clientSalaryNulls_offline_table_config.json | 18 +++
.../clientSalaryNulls_schema.json | 38 ++++++
.../batch/clientSalaryNulls/ingestionJobSpec.yaml | 129 +++++++++++++++++++++
.../rawdata/clientSalaryNulls_data.avro | Bin 0 -> 3192 bytes
5 files changed, 292 insertions(+)
diff --git
a/pinot-tools/src/main/java/org/apache/pinot/tools/NullHandlingQuickstart.java
b/pinot-tools/src/main/java/org/apache/pinot/tools/NullHandlingQuickstart.java
new file mode 100644
index 0000000000..7f0a8a727b
--- /dev/null
+++
b/pinot-tools/src/main/java/org/apache/pinot/tools/NullHandlingQuickstart.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.tools;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import org.apache.pinot.spi.utils.CommonConstants;
+import org.apache.pinot.tools.admin.PinotAdministrator;
+import org.apache.pinot.tools.admin.command.QuickstartRunner;
+
+
+/**
+ * Quickstart with a table that has some null values in order to be able to
play around with Pinot's null handling
+ * related features.
+ */
+public class NullHandlingQuickstart extends Quickstart {
+
+ private static final String[] NULL_HANDLING_TABLE_DIRS = new
String[]{"examples/batch/clientSalaryNulls"};
+
+ @Override
+ public List<String> types() {
+ return Collections.singletonList("NULL_HANDLING");
+ }
+
+ @Override
+ public String[] getDefaultBatchTableDirectories() {
+ return NULL_HANDLING_TABLE_DIRS;
+ }
+
+ @Override
+ public void runSampleQueries(QuickstartRunner runner)
+ throws Exception {
+ printStatus(Quickstart.Color.YELLOW, "***** Null handling quickstart setup
complete *****");
+
+ Map<String, String> queryOptions = Collections.singletonMap("queryOptions",
+ CommonConstants.Broker.Request.QueryOptionKey.ENABLE_NULL_HANDLING +
"=true");
+
+ printStatus(Quickstart.Color.YELLOW, "Total number of documents in the
table");
+ String query = "SELECT COUNT(*) FROM clientSalaryNulls";
+ printStatus(Quickstart.Color.CYAN, "Query : " + query);
+ printStatus(Quickstart.Color.YELLOW,
prettyPrintResponse(runner.runQuery(query, queryOptions)));
+ printStatus(Quickstart.Color.GREEN,
"***************************************************");
+
+ printStatus(Quickstart.Color.YELLOW, "Total number of documents in the
table with null salary values");
+ query = "SELECT COUNT(*) FROM clientSalaryNulls WHERE salary IS NULL";
+ printStatus(Quickstart.Color.CYAN, "Query : " + query);
+ printStatus(Quickstart.Color.YELLOW,
prettyPrintResponse(runner.runQuery(query, queryOptions)));
+ printStatus(Quickstart.Color.GREEN,
"***************************************************");
+
+ printStatus(Quickstart.Color.YELLOW, "Total number of documents in the
table with non-null description");
+ query = "SELECT COUNT(*) FROM clientSalaryNulls WHERE description IS NOT
NULL";
+ printStatus(Quickstart.Color.CYAN, "Query : " + query);
+ printStatus(Quickstart.Color.YELLOW,
prettyPrintResponse(runner.runQuery(query, queryOptions)));
+ printStatus(Quickstart.Color.GREEN,
"***************************************************");
+
+ printStatus(Quickstart.Color.YELLOW, "Minimum salary with null handling
enabled");
+ query = "SELECT MIN(salary) FROM clientSalaryNulls";
+ printStatus(Quickstart.Color.CYAN, "Query : " + query);
+ printStatus(Quickstart.Color.YELLOW,
prettyPrintResponse(runner.runQuery(query, queryOptions)));
+ printStatus(Quickstart.Color.GREEN,
"***************************************************");
+
+ printStatus(Quickstart.Color.YELLOW, "Minimum salary without null handling
enabled");
+ query = "SELECT MIN(salary) FROM clientSalaryNulls";
+ printStatus(Quickstart.Color.CYAN, "Query : " + query);
+ printStatus(Quickstart.Color.YELLOW,
prettyPrintResponse(runner.runQuery(query)));
+ printStatus(Quickstart.Color.GREEN,
"***************************************************");
+
+ printStatus(Quickstart.Color.YELLOW, "Count where salary is less than
80000");
+ query = "SELECT COUNT(*) FROM clientSalaryNulls WHERE salary < 80000";
+ printStatus(Quickstart.Color.CYAN, "Query : " + query);
+ printStatus(Quickstart.Color.YELLOW,
prettyPrintResponse(runner.runQuery(query, queryOptions)));
+ printStatus(Quickstart.Color.GREEN,
"***************************************************");
+
+ printStatus(Quickstart.Color.YELLOW, "Count where salary is less than
80000 (without null handling enabled)");
+ query = "SELECT COUNT(*) FROM clientSalaryNulls WHERE salary < 80000";
+ printStatus(Quickstart.Color.CYAN, "Query : " + query);
+ printStatus(Quickstart.Color.YELLOW,
prettyPrintResponse(runner.runQuery(query)));
+ printStatus(Quickstart.Color.GREEN,
"***************************************************");
+ }
+
+ public static void main(String[] args)
+ throws Exception {
+ List<String> arguments = new ArrayList<>();
+ arguments.addAll(Arrays.asList("QuickStart", "-type", "NULL_HANDLING"));
+ arguments.addAll(Arrays.asList(args));
+ PinotAdministrator.main(arguments.toArray(new String[0]));
+ }
+}
diff --git
a/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_offline_table_config.json
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_offline_table_config.json
new file mode 100644
index 0000000000..08a322eb73
--- /dev/null
+++
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_offline_table_config.json
@@ -0,0 +1,18 @@
+{
+ "tableName": "clientSalaryNulls",
+ "segmentsConfig" : {
+ "replication" : "1",
+ "schemaName" : "clientSalaryNulls"
+ },
+ "tableIndexConfig" : {
+ "invertedIndexColumns" : [],
+ "loadMode" : "MMAP"
+ },
+ "tenants" : {
+ "broker":"DefaultTenant",
+ "server":"DefaultTenant"
+ },
+ "tableType":"OFFLINE",
+ "metadata": {}
+}
+
diff --git
a/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_schema.json
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_schema.json
new file mode 100644
index 0000000000..c69ae185c9
--- /dev/null
+++
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_schema.json
@@ -0,0 +1,38 @@
+{
+ "dimensionFieldSpecs": [
+ {
+ "dataType": "INT",
+ "singleValueField": true,
+ "name": "clientId",
+ "notNull": true
+ },
+ {
+ "dataType": "STRING",
+ "singleValueField": true,
+ "name": "city",
+ "notNull": true
+ },
+ {
+ "dataType": "STRING",
+ "singleValueField": true,
+ "name": "description",
+ "notNull": false
+ },
+ {
+ "dataType": "INT",
+ "singleValueField": true,
+ "name": "salary",
+ "notNull": false
+ }
+ ],
+ "dateTimeFieldSpecs": [
+ {
+ "name": "DaysSinceEpoch",
+ "dataType": "INT",
+ "format": "1:DAYS:EPOCH",
+ "granularity": "1:DAYS"
+ }
+ ],
+ "schemaName": "clientSalaryNulls",
+ "enableColumnBasedNullHandling": true
+}
diff --git
a/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/ingestionJobSpec.yaml
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/ingestionJobSpec.yaml
new file mode 100644
index 0000000000..4bdd5519fd
--- /dev/null
+++
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/ingestionJobSpec.yaml
@@ -0,0 +1,129 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# executionFrameworkSpec: Defines ingestion jobs to be running.
+executionFrameworkSpec:
+
+ # name: execution framework name
+ name: 'standalone'
+
+ # Class to use for segment generation and different push types.
+ segmentGenerationJobRunnerClassName:
'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentGenerationJobRunner'
+ segmentTarPushJobRunnerClassName:
'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentTarPushJobRunner'
+ segmentUriPushJobRunnerClassName:
'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentUriPushJobRunner'
+ segmentMetadataPushJobRunnerClassName:
'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentMetadataPushJobRunner'
+
+# jobType: Pinot ingestion job type.
+# Supported job types are defined in PinotIngestionJobType class.
+# 'SegmentCreation'
+# 'SegmentTarPush'
+# 'SegmentUriPush'
+# 'SegmentMetadataPush'
+# 'SegmentCreationAndTarPush'
+# 'SegmentCreationAndUriPush'
+# 'SegmentCreationAndMetadataPush'
+jobType: SegmentCreationAndTarPush
+
+# inputDirURI: Root directory of input data, expected to have scheme
configured in PinotFS.
+inputDirURI: 'examples/batch/clientSalaryNulls/rawdata'
+
+# includeFileNamePattern: include file name pattern, supported glob pattern.
+# Sample usage:
+# 'glob:*.avro' will include all avro files just under the inputDirURI, not
sub directories;
+# 'glob:**/*.avro' will include all the avro files under inputDirURI
recursively.
+includeFileNamePattern: 'glob:**/*.avro'
+
+# excludeFileNamePattern: exclude file name pattern, supported glob pattern.
+# Sample usage:
+# 'glob:*.avro' will exclude all avro files just under the inputDirURI, not
sub directories;
+# 'glob:**/*.avro' will exclude all the avro files under inputDirURI
recursively.
+# _excludeFileNamePattern: ''
+
+# outputDirURI: Root directory of output segments, expected to have scheme
configured in PinotFS.
+outputDirURI: 'examples/batch/clientSalaryNulls/segments'
+
+# overwriteOutput: Overwrite output segments if existed.
+overwriteOutput: true
+
+# pinotFSSpecs: defines all related Pinot file systems.
+pinotFSSpecs:
+
+ - # scheme: used to identify a PinotFS.
+ # E.g. local, hdfs, dbfs, etc
+ scheme: file
+
+ # className: Class name used to create the PinotFS instance.
+ # E.g.
+ # org.apache.pinot.spi.filesystem.LocalPinotFS is used for local
filesystem
+ # org.apache.pinot.plugin.filesystem.AzurePinotFS is used for Azure Data
Lake
+ # org.apache.pinot.plugin.filesystem.HadoopPinotFS is used for HDFS
+ className: org.apache.pinot.spi.filesystem.LocalPinotFS
+
+# recordReaderSpec: defines all record reader
+recordReaderSpec:
+
+ # dataFormat: Record data format, e.g. 'avro', 'parquet', 'orc', 'csv',
'json', 'thrift' etc.
+ dataFormat: 'avro'
+
+ # className: Corresponding RecordReader class name.
+ # E.g.
+ # org.apache.pinot.plugin.inputformat.avro.AvroRecordReader
+ # org.apache.pinot.plugin.inputformat.csv.CSVRecordReader
+ # org.apache.pinot.plugin.inputformat.parquet.ParquetRecordReader
+ # org.apache.pinot.plugin.inputformat.parquet.ParquetNativeRecordReader
+ # org.apache.pinot.plugin.inputformat.json.JSONRecordReader
+ # org.apache.pinot.plugin.inputformat.orc.ORCRecordReader
+ # org.apache.pinot.plugin.inputformat.thrift.ThriftRecordReader
+ className: 'org.apache.pinot.plugin.inputformat.avro.AvroRecordReader'
+
+# tableSpec: defines table name and where to fetch corresponding table config
and table schema.
+tableSpec:
+
+ # tableName: Table name
+ tableName: 'clientSalaryNulls'
+
+ # schemaURI: defines where to read the table schema, supports PinotFS or
HTTP.
+ # E.g.
+ # hdfs://path/to/table_schema.json
+ # http://localhost:9000/tables/myTable/schema
+ schemaURI: 'http://localhost:9000/tables/clientSalaryNulls/schema'
+
+ # tableConfigURI: defines where to read the table config.
+ # Supports using PinotFS or HTTP.
+ # E.g.
+ # hdfs://path/to/table_config.json
+ # http://localhost:9000/tables/myTable
+ # Note that the API to read Pinot table config directly from pinot
controller contains a JSON wrapper.
+ # The real table config is the object under the field 'OFFLINE'.
+ tableConfigURI: 'http://localhost:9000/tables/clientSalaryNulls'
+
+# pinotClusterSpecs: defines the Pinot Cluster Access Point.
+pinotClusterSpecs:
+ - # controllerURI: used to fetch table/schema information and data push.
+ # E.g. http://localhost:9000
+ controllerURI: 'http://localhost:9000'
+
+# pushJobSpec: defines segment push job related configuration.
+pushJobSpec:
+
+ # pushAttempts: number of attempts for push job, default is 1, which means
no retry.
+ pushAttempts: 2
+
+ # pushRetryIntervalMillis: retry wait Ms, default to 1 second.
+ pushRetryIntervalMillis: 1000
diff --git
a/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/rawdata/clientSalaryNulls_data.avro
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/rawdata/clientSalaryNulls_data.avro
new file mode 100644
index 0000000000..c7843d4738
Binary files /dev/null and
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/rawdata/clientSalaryNulls_data.avro
differ
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]