This is an automated email from the ASF dual-hosted git repository. boroknagyz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 9c12ef66cc9c4c54388d07b638fdc86548e392c2 Author: Zoltan Borok-Nagy <[email protected]> AuthorDate: Mon Jul 7 18:59:23 2025 +0200 IMPALA-14018: Adding utility scripts to run Lakekeeper in Impala dev environment This patch adds utility scripts to run Lakekeeper (an open source Iceberg REST Catalog) in Impala's dev environment. Lakekeeper's HDFS support is in preview phase, so we are using a preview docker image for now. IcebergRESTCatalog's config setup is also refactored, and now we don't always set "credentials" in the SessionContext, only if they are provided. Usage To start Lakekeeper: testdata/bin/run-lakekeeper.sh To stop Lakekeeper: testdata/bin/stop-lakekeeper.sh Now you can create schemas and tables via Trino (need to rebuild the Trino image for this, TODO: use docker compose for this): docker stop impala-minicluster-trino docker rm impala-minicluster-trino ./testdata/bin/build-trino-docker-image.sh ./testdata/bin/run-trino.sh Then via Trino CLI: testdata/bin/trino-cli.sh show catalogs; create schema iceberg_lakekeeper.trino_db; use iceberg_lakekeeper.trino_db; create table trino_t (i int); insert into trino_t values (35); After this, you should be able to query the table via Impala: mkdir /tmp/iceberg_lakekeeper cp testdata/bin/minicluster_trino/iceberg_lakekeeper.properties /tmp/iceberg_lakekeeper bin/start-impala-cluster.py --no_catalogd \ --impalad_args="--catalogd_deployed=false --use_local_catalog=true \ --catalog_config_dir=/tmp/iceberg_lakekeeper/" bin/impala-shell.sh Change-Id: I610f5859f92b2ff82e310f46356e3f118e986b2c Reviewed-on: http://gerrit.cloudera.org:8080/23141 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- bin/rat_exclude_files.txt | 1 + .../impala/catalog/iceberg/IcebergRESTCatalog.java | 95 ++++++++++----- testdata/bin/minicluster_lakekeeper/README.md | 49 ++++++++ .../create-default-warehouse.json | 13 ++ .../bin/minicluster_lakekeeper/docker-compose.yaml | 131 
+++++++++++++++++++++ testdata/bin/minicluster_trino/Dockerfile | 2 +- .../{Dockerfile => iceberg_lakekeeper.properties} | 22 +--- .../Dockerfile => run-lakekeeper.sh} | 22 ++-- .../Dockerfile => stop-lakekeeper.sh} | 19 +-- 9 files changed, 278 insertions(+), 76 deletions(-) diff --git a/bin/rat_exclude_files.txt b/bin/rat_exclude_files.txt index c9652ed7b..45efbf7c3 100644 --- a/bin/rat_exclude_files.txt +++ b/bin/rat_exclude_files.txt @@ -103,6 +103,7 @@ README*.md */README.dox */README.txt testdata/bin/README-BENCHMARK-TEST-GENERATION +testdata/bin/minicluster_lakekeeper/README.md testdata/scale_test_metadata/README.md tests/comparison/ORACLE.txt bin/distcc/README.md diff --git a/fe/src/main/java/org/apache/impala/catalog/iceberg/IcebergRESTCatalog.java b/fe/src/main/java/org/apache/impala/catalog/iceberg/IcebergRESTCatalog.java index 5bfa7a51c..61e1e0529 100644 --- a/fe/src/main/java/org/apache/impala/catalog/iceberg/IcebergRESTCatalog.java +++ b/fe/src/main/java/org/apache/impala/catalog/iceberg/IcebergRESTCatalog.java @@ -63,43 +63,78 @@ public class IcebergRESTCatalog implements IcebergCatalog { return instance_; } + private static class IcebergRestConfig { + String catalogName; + String uri; + String user; + String secret; + String credential; + String warehouseLocation; + + IcebergRestConfig(Properties properties) { + uri = getRequiredProperty(properties, KEY_URI); + catalogName = properties.getProperty(KEY_NAME, ""); + user = properties.getProperty(KEY_CLIENT_ID); + secret = properties.getProperty(KEY_CLIENT_SECRET); + credential = getCredential(); + warehouseLocation = properties.getProperty(KEY_WAREHOUSE); + } + + public Map<String, String> getCatalogProperties() { + ImmutableMap.Builder<String, String> mapBuilder = new ImmutableMap.Builder<>(); + mapBuilder.put(CatalogProperties.URI, uri); + if (credential != null) mapBuilder.put("credential", credential); + if (warehouseLocation != null){ + mapBuilder.put(CatalogProperties.WAREHOUSE_LOCATION, 
warehouseLocation); + } + return mapBuilder.build(); + } + + public SessionCatalog.SessionContext getSessionContext() { + return new SessionCatalog.SessionContext( + UUID.randomUUID().toString(), + user, + getCredentialMap(), + ImmutableMap.of()); + } + + private String getRequiredProperty(Properties properties, String key) { + String value = properties.getProperty(key); + if (value == null) { + throw new IllegalStateException( + String.format("Missing property of IcebergRESTCatalog: %s", key)); + } + return value; + } + + private String getCredential() { + if (user != null && secret != null) { + return user + ":" + secret; + } + return null; + } + + private ImmutableMap<String, String> getCredentialMap() { + ImmutableMap.Builder<String, String> mapBuilder = new ImmutableMap.Builder<>(); + if (credential != null) { + mapBuilder.put("credential", credential); + } + return mapBuilder.build(); + } + } + private IcebergRESTCatalog(Properties properties) { setContextClassLoader(); - REST_URI = getRequiredProperty(properties, KEY_URI); - final String CATALOG_NAME = properties.getProperty(KEY_NAME, ""); - final String CLIENT_ID = properties.getProperty(KEY_CLIENT_ID, "impala"); - final String CLIENT_SECRET = properties.getProperty(KEY_CLIENT_SECRET, ""); - final String CLIENT_CREDS = CLIENT_ID + ":" + CLIENT_SECRET; - final String WAREHOUSE_LOCATION = properties.getProperty(KEY_WAREHOUSE, ""); - - SessionCatalog.SessionContext context = - new SessionCatalog.SessionContext( - UUID.randomUUID().toString(), - "user", - ImmutableMap.of("credential", CLIENT_CREDS), - ImmutableMap.of()); - - restCatalog_ = new RESTCatalog(context, + IcebergRestConfig restConfig = new IcebergRestConfig(properties); + REST_URI = restConfig.uri; + restCatalog_ = new RESTCatalog(restConfig.getSessionContext(), (config) -> HTTPClient.builder(config).uri(REST_URI).build()); HiveConf conf = new HiveConf(IcebergRESTCatalog.class); restCatalog_.setConf(conf); restCatalog_.initialize( - CATALOG_NAME, - 
ImmutableMap.of( - CatalogProperties.URI, REST_URI, - "credential", CLIENT_CREDS, - CatalogProperties.WAREHOUSE_LOCATION, WAREHOUSE_LOCATION) - ); - } - - private String getRequiredProperty(Properties properties, String key) { - String value = properties.getProperty(key); - if (value == null) { - throw new IllegalStateException( - String.format("Missing property of IcebergRESTCatalog: %s", key)); - } - return value; + restConfig.catalogName, + restConfig.getCatalogProperties()); } public String getUri() { diff --git a/testdata/bin/minicluster_lakekeeper/README.md b/testdata/bin/minicluster_lakekeeper/README.md new file mode 100644 index 000000000..b1e5b2df8 --- /dev/null +++ b/testdata/bin/minicluster_lakekeeper/README.md @@ -0,0 +1,49 @@ +## About Lakekeeper +Lakekeeper is an Apache-Licensed implementation of the Apache Iceberg REST Catalog specification. See more at https://github.com/lakekeeper/lakekeeper + +## Prerequisites to use Lakekeeper +You need docker compose (Compose V2) in your environment. This usually means you just need a recent docker version. Sometimes you need to install the docker compose plugin. + +## Run and stop Lakekeeper in dev environment +Via the following scripts you can run/stop Lakekeeper. Be aware that each restart resets the warehouse contents. +``` +${IMPALA_HOME}/testdata/bin/run-lakekeeper.sh +${IMPALA_HOME}/testdata/bin/stop-lakekeeper.sh +``` + +## Ingesting data +Until Impala can write Iceberg tables in the REST Catalog, you can use Trino to create tables. 
+Let's rebuild our Trino image for this: +``` +docker stop impala-minicluster-trino +docker rm impala-minicluster-trino +./testdata/bin/build-trino-docker-image.sh +./testdata/bin/run-trino.sh +``` + +Let's connect to Trino via its CLI: +``` +testdata/bin/trino-cli.sh +``` + +Now we can execute the following commands: +``` +show catalogs; +create schema iceberg_lakekeeper.trino_db; +use iceberg_lakekeeper.trino_db; +create table trino_t (i int); +insert into trino_t values (35); +``` + +## Query via Impala +After this, you should be able to query the table via Impala: +``` +mkdir /tmp/iceberg_lakekeeper +cp testdata/bin/minicluster_trino/iceberg_lakekeeper.properties /tmp/iceberg_lakekeeper + +bin/start-impala-cluster.py --no_catalogd \ + --impalad_args="--catalogd_deployed=false --use_local_catalog=true \ + --catalog_config_dir=/tmp/iceberg_lakekeeper/" + +bin/impala-shell.sh +``` diff --git a/testdata/bin/minicluster_lakekeeper/create-default-warehouse.json b/testdata/bin/minicluster_lakekeeper/create-default-warehouse.json new file mode 100644 index 000000000..f969196a9 --- /dev/null +++ b/testdata/bin/minicluster_lakekeeper/create-default-warehouse.json @@ -0,0 +1,13 @@ +{ + "warehouse-name": "lakekeeper_demo", + "project-id": "00000000-0000-0000-0000-000000000000", + "storage-profile": { + "type": "hdfs", + "url": "hdfs://localhost:20500", + "key-prefix": "/test-warehouse/lakekeeper", + "config": { + "dfs.client.read.shortcircuit": "true", + "dfs.domain.socket.path": "/var/lib/hadoop-hdfs/dn_socket" + } + } +} diff --git a/testdata/bin/minicluster_lakekeeper/docker-compose.yaml b/testdata/bin/minicluster_lakekeeper/docker-compose.yaml new file mode 100644 index 000000000..1e1c6ba25 --- /dev/null +++ b/testdata/bin/minicluster_lakekeeper/docker-compose.yaml @@ -0,0 +1,131 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +services: + lakekeeper: + image: ${LAKEKEEPER_TEST__SERVER_IMAGE:-quay.io/lakekeeper/catalog:v0.8.3-hdfs-preview} + pull_policy: always + environment: + - LAKEKEEPER__PG_ENCRYPTION_KEY=This-is-NOT-Secure! + - LAKEKEEPER__PG_DATABASE_URL_READ=postgresql://postgres:postgres@localhost:54321/postgres + - LAKEKEEPER__PG_DATABASE_URL_WRITE=postgresql://postgres:postgres@localhost:54321/postgres + - LAKEKEEPER__ENABLE_HDFS_WITH_SYSTEM_CREDENTIALS=true + - RUST_LOG=trace,axum=trace,sqlx=trace,iceberg-catalog=trace + - HADOOP_USER_NAME=${USER} + - HADOOP_CONF_DIR=/etc/hadoop + - RUST_LOG=debug,hdfs_native=trace + command: [ "serve" ] + healthcheck: + test: [ "CMD", "/home/nonroot/iceberg-catalog", "healthcheck" ] + interval: 1s + timeout: 10s + retries: 3 + start_period: 3s + depends_on: + migrate: + condition: service_completed_successfully + db: + condition: service_healthy + volumes: + - ./core-site.xml:/etc/hadoop/core-site.xml:ro + - ./hdfs-site.xml:/etc/hadoop/hdfs-site.xml:ro + network_mode: host + + migrate: + image: ${LAKEKEEPER_TEST__SERVER_IMAGE:-quay.io/lakekeeper/catalog:v0.8.3-hdfs-preview} + pull_policy: always + environment: + - LAKEKEEPER__PG_ENCRYPTION_KEY=This-is-NOT-Secure! 
+ - LAKEKEEPER__ENABLE_HDFS_WITH_SYSTEM_CREDENTIALS=true + - LAKEKEEPER__PG_DATABASE_URL_READ=postgresql://postgres:postgres@db:5432/postgres + - LAKEKEEPER__PG_DATABASE_URL_WRITE=postgresql://postgres:postgres@db:5432/postgres + - RUST_LOG=info + restart: "no" + command: [ "migrate" ] + depends_on: + db: + condition: service_healthy + networks: + lakekeeper_net: + + bootstrap: + image: curlimages/curl + depends_on: + lakekeeper: + condition: service_healthy + restart: "no" + command: + - -w + - "%{http_code}" + - "-X" + - "POST" + - "-v" + - "http://localhost:8181/management/v1/bootstrap" + - "-H" + - "Content-Type: application/json" + - "--data" + - '{"accept-terms-of-use": true}' + - "-o" + - "/dev/null" + # - "--fail-with-body" + network_mode: host + + initialwarehouse: + image: curlimages/curl + depends_on: + lakekeeper: + condition: service_healthy + bootstrap: + condition: service_completed_successfully + restart: "no" + command: + - -w + - "%{http_code}" + - "-X" + - "POST" + - "-v" + - "http://localhost:8181/management/v1/warehouse" + - "-H" + - "Content-Type: application/json" + - "--data" + - "@create-default-warehouse.json" + - "-o" + - "/dev/null" + volumes: + - ./create-default-warehouse.json:/home/curl_user/create-default-warehouse.json + network_mode: host + + db: + image: bitnami/postgresql:16.3.0 + environment: + - POSTGRESQL_USERNAME=postgres + - POSTGRESQL_PASSWORD=postgres + - POSTGRESQL_DATABASE=postgres + healthcheck: + test: [ "CMD-SHELL", "pg_isready -U postgres -p 5432 -d postgres" ] + interval: 2s + timeout: 10s + retries: 2 + start_period: 10s + networks: + lakekeeper_net: + ports: + - "54321:5432" + +networks: + lakekeeper_net: + diff --git a/testdata/bin/minicluster_trino/Dockerfile b/testdata/bin/minicluster_trino/Dockerfile index 041f03b34..c8ffa2186 100644 --- a/testdata/bin/minicluster_trino/Dockerfile +++ b/testdata/bin/minicluster_trino/Dockerfile @@ -27,7 +27,7 @@ RUN \ echo "-DHADOOP_USER_NAME=$USERNAME" >> 
/etc/trino/jvm.config COPY hive-site.xml core-site.xml hdfs-site.xml /etc/ -COPY iceberg_rest.properties iceberg.properties hive.properties /etc/trino/catalog/ +COPY iceberg_lakekeeper.properties iceberg_rest.properties iceberg.properties hive.properties /etc/trino/catalog/ # Expose the Trino port EXPOSE 9091 diff --git a/testdata/bin/minicluster_trino/Dockerfile b/testdata/bin/minicluster_trino/iceberg_lakekeeper.properties similarity index 57% copy from testdata/bin/minicluster_trino/Dockerfile copy to testdata/bin/minicluster_trino/iceberg_lakekeeper.properties index 041f03b34..c72d0abfa 100644 --- a/testdata/bin/minicluster_trino/Dockerfile +++ b/testdata/bin/minicluster_trino/iceberg_lakekeeper.properties @@ -15,19 +15,9 @@ # specific language governing permissions and limitations # under the License. -# Use an official Trino image as the base -FROM trinodb/trino:latest - -# Use the developer username, so Trino will have write access to HDFS -ARG USERNAME - -RUN \ - sed -i 's/http-server.http.port=8080/http-server.http.port=9091/' /etc/trino/config.properties && \ - sed -i 's/localhost:8080/localhost:9091/' /etc/trino/config.properties && \ - echo "-DHADOOP_USER_NAME=$USERNAME" >> /etc/trino/jvm.config - -COPY hive-site.xml core-site.xml hdfs-site.xml /etc/ -COPY iceberg_rest.properties iceberg.properties hive.properties /etc/trino/catalog/ - -# Expose the Trino port -EXPOSE 9091 +connector.name=iceberg +iceberg.catalog.type=rest +iceberg.rest-catalog.uri=http://localhost:8181/catalog +iceberg.rest-catalog.warehouse=lakekeeper_demo +fs.hadoop.enabled=true +hive.config.resources=/etc/hive-site.xml,/etc/hdfs-site.xml,/etc/core-site.xml diff --git a/testdata/bin/minicluster_trino/Dockerfile b/testdata/bin/run-lakekeeper.sh old mode 100644 new mode 100755 similarity index 57% copy from testdata/bin/minicluster_trino/Dockerfile copy to testdata/bin/run-lakekeeper.sh index 041f03b34..559ba319b --- a/testdata/bin/minicluster_trino/Dockerfile +++ 
b/testdata/bin/run-lakekeeper.sh @@ -1,3 +1,5 @@ +#!/bin/bash +# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,19 +17,11 @@ # specific language governing permissions and limitations # under the License. -# Use an official Trino image as the base -FROM trinodb/trino:latest - -# Use the developer username, so Trino will have write access to HDFS -ARG USERNAME - -RUN \ - sed -i 's/http-server.http.port=8080/http-server.http.port=9091/' /etc/trino/config.properties && \ - sed -i 's/localhost:8080/localhost:9091/' /etc/trino/config.properties && \ - echo "-DHADOOP_USER_NAME=$USERNAME" >> /etc/trino/jvm.config +# Copy cluster configs to trino docker directory. +pushd ${HADOOP_CONF_DIR} +cp core-site.xml hdfs-site.xml ${IMPALA_HOME}/testdata/bin/minicluster_lakekeeper +popd -COPY hive-site.xml core-site.xml hdfs-site.xml /etc/ -COPY iceberg_rest.properties iceberg.properties hive.properties /etc/trino/catalog/ +cd ${IMPALA_HOME}/testdata/bin/minicluster_lakekeeper -# Expose the Trino port -EXPOSE 9091 +docker compose up -d diff --git a/testdata/bin/minicluster_trino/Dockerfile b/testdata/bin/stop-lakekeeper.sh old mode 100644 new mode 100755 similarity index 57% copy from testdata/bin/minicluster_trino/Dockerfile copy to testdata/bin/stop-lakekeeper.sh index 041f03b34..ca8adad8e --- a/testdata/bin/minicluster_trino/Dockerfile +++ b/testdata/bin/stop-lakekeeper.sh @@ -1,3 +1,5 @@ +#!/bin/bash +# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,19 +17,6 @@ # specific language governing permissions and limitations # under the License. 
-# Use an official Trino image as the base -FROM trinodb/trino:latest - -# Use the developer username, so Trino will have write access to HDFS -ARG USERNAME - -RUN \ - sed -i 's/http-server.http.port=8080/http-server.http.port=9091/' /etc/trino/config.properties && \ - sed -i 's/localhost:8080/localhost:9091/' /etc/trino/config.properties && \ - echo "-DHADOOP_USER_NAME=$USERNAME" >> /etc/trino/jvm.config - -COPY hive-site.xml core-site.xml hdfs-site.xml /etc/ -COPY iceberg_rest.properties iceberg.properties hive.properties /etc/trino/catalog/ +cd ${IMPALA_HOME}/testdata/bin/minicluster_lakekeeper -# Expose the Trino port -EXPOSE 9091 +docker compose down
