This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 9c12ef66cc9c4c54388d07b638fdc86548e392c2
Author: Zoltan Borok-Nagy <[email protected]>
AuthorDate: Mon Jul 7 18:59:23 2025 +0200

    IMPALA-14018: Adding utility scripts to run Lakekeeper in Impala dev
environment
    
    This patch adds utility scripts to run Lakekeeper (an open source
    Iceberg REST Catalog) in Impala's dev environment. Lakekeeper's HDFS
    support is in preview phase, so we are using a preview docker image
    for now.
    
    IcebergRESTCatalog's config setup is also refactored, and now we don't
    always set "credentials" in the SessionContext, only if they are
    provided.
    
    Usage
    
    To start Lakekeeper:
    testdata/bin/run-lakekeeper.sh
    
    To stop Lakekeeper:
    testdata/bin/stop-lakekeeper.sh
    
    Now you can create schemas and tables via Trino (need to rebuild the
    Trino image for this, TODO: use docker compose for this):
    
    docker stop impala-minicluster-trino
    docker rm impala-minicluster-trino
    ./testdata/bin/build-trino-docker-image.sh
    ./testdata/bin/run-trino.sh
    
    Then via Trino CLI:
    testdata/bin/trino-cli.sh
    
    show catalogs;
    create schema iceberg_lakekeeper.trino_db;
    use iceberg_lakekeeper.trino_db;
    create table trino_t (i int);
    insert into trino_t values (35);
    
    After this, you should be able to query the table via Impala:
    
    mkdir /tmp/iceberg_lakekeeper
    cp testdata/bin/minicluster_trino/iceberg_lakekeeper.properties 
/tmp/iceberg_lakekeeper
    
    bin/start-impala-cluster.py --no_catalogd \
        --impalad_args="--catalogd_deployed=false --use_local_catalog=true \
        --catalog_config_dir=/tmp/iceberg_lakekeeper/"
    
    bin/impala-shell.sh
    
    Change-Id: I610f5859f92b2ff82e310f46356e3f118e986b2c
    Reviewed-on: http://gerrit.cloudera.org:8080/23141
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 bin/rat_exclude_files.txt                          |   1 +
 .../impala/catalog/iceberg/IcebergRESTCatalog.java |  95 ++++++++++-----
 testdata/bin/minicluster_lakekeeper/README.md      |  49 ++++++++
 .../create-default-warehouse.json                  |  13 ++
 .../bin/minicluster_lakekeeper/docker-compose.yaml | 131 +++++++++++++++++++++
 testdata/bin/minicluster_trino/Dockerfile          |   2 +-
 .../{Dockerfile => iceberg_lakekeeper.properties}  |  22 +---
 .../Dockerfile => run-lakekeeper.sh}               |  22 ++--
 .../Dockerfile => stop-lakekeeper.sh}              |  19 +--
 9 files changed, 278 insertions(+), 76 deletions(-)

diff --git a/bin/rat_exclude_files.txt b/bin/rat_exclude_files.txt
index c9652ed7b..45efbf7c3 100644
--- a/bin/rat_exclude_files.txt
+++ b/bin/rat_exclude_files.txt
@@ -103,6 +103,7 @@ README*.md
 */README.dox
 */README.txt
 testdata/bin/README-BENCHMARK-TEST-GENERATION
+testdata/bin/minicluster_lakekeeper/README.md
 testdata/scale_test_metadata/README.md
 tests/comparison/ORACLE.txt
 bin/distcc/README.md
diff --git 
a/fe/src/main/java/org/apache/impala/catalog/iceberg/IcebergRESTCatalog.java 
b/fe/src/main/java/org/apache/impala/catalog/iceberg/IcebergRESTCatalog.java
index 5bfa7a51c..61e1e0529 100644
--- a/fe/src/main/java/org/apache/impala/catalog/iceberg/IcebergRESTCatalog.java
+++ b/fe/src/main/java/org/apache/impala/catalog/iceberg/IcebergRESTCatalog.java
@@ -63,43 +63,78 @@ public class IcebergRESTCatalog implements IcebergCatalog {
     return instance_;
   }
 
+  private static class IcebergRestConfig {
+    String catalogName;
+    String uri;
+    String user;
+    String secret;
+    String credential;
+    String warehouseLocation;
+
+    IcebergRestConfig(Properties properties) {
+      uri = getRequiredProperty(properties, KEY_URI);
+      catalogName = properties.getProperty(KEY_NAME, "");
+      user = properties.getProperty(KEY_CLIENT_ID);
+      secret = properties.getProperty(KEY_CLIENT_SECRET);
+      credential = getCredential();
+      warehouseLocation = properties.getProperty(KEY_WAREHOUSE);
+    }
+
+    public Map<String, String> getCatalogProperties() {
+      ImmutableMap.Builder<String, String> mapBuilder = new 
ImmutableMap.Builder<>();
+      mapBuilder.put(CatalogProperties.URI, uri);
+      if (credential != null) mapBuilder.put("credential", credential);
+      if (warehouseLocation != null){
+        mapBuilder.put(CatalogProperties.WAREHOUSE_LOCATION, 
warehouseLocation);
+      }
+      return mapBuilder.build();
+    }
+
+    public SessionCatalog.SessionContext getSessionContext() {
+      return new SessionCatalog.SessionContext(
+          UUID.randomUUID().toString(),
+          user,
+          getCredentialMap(),
+          ImmutableMap.of());
+    }
+
+    private String getRequiredProperty(Properties properties, String key) {
+      String value = properties.getProperty(key);
+      if (value == null) {
+        throw new IllegalStateException(
+            String.format("Missing property of IcebergRESTCatalog: %s", key));
+      }
+      return value;
+    }
+
+    private String getCredential() {
+      if (user != null && secret != null) {
+        return user + ":" + secret;
+      }
+      return null;
+    }
+
+    private ImmutableMap<String, String> getCredentialMap() {
+      ImmutableMap.Builder<String, String> mapBuilder = new 
ImmutableMap.Builder<>();
+      if (credential != null) {
+        mapBuilder.put("credential", credential);
+      }
+      return mapBuilder.build();
+    }
+  }
+
   private IcebergRESTCatalog(Properties properties) {
     setContextClassLoader();
 
-    REST_URI = getRequiredProperty(properties, KEY_URI);
-    final String CATALOG_NAME = properties.getProperty(KEY_NAME, "");
-    final String CLIENT_ID = properties.getProperty(KEY_CLIENT_ID, "impala");
-    final String CLIENT_SECRET = properties.getProperty(KEY_CLIENT_SECRET, "");
-    final String CLIENT_CREDS = CLIENT_ID + ":" + CLIENT_SECRET;
-    final String WAREHOUSE_LOCATION = properties.getProperty(KEY_WAREHOUSE, 
"");
-
-    SessionCatalog.SessionContext context =
-        new SessionCatalog.SessionContext(
-            UUID.randomUUID().toString(),
-            "user",
-            ImmutableMap.of("credential", CLIENT_CREDS),
-            ImmutableMap.of());
-
-    restCatalog_ = new RESTCatalog(context,
+    IcebergRestConfig restConfig = new IcebergRestConfig(properties);
+    REST_URI = restConfig.uri;
+    restCatalog_ = new RESTCatalog(restConfig.getSessionContext(),
         (config) -> HTTPClient.builder(config).uri(REST_URI).build());
     HiveConf conf = new HiveConf(IcebergRESTCatalog.class);
     restCatalog_.setConf(conf);
     restCatalog_.initialize(
-        CATALOG_NAME,
-        ImmutableMap.of(
-            CatalogProperties.URI, REST_URI,
-            "credential", CLIENT_CREDS,
-            CatalogProperties.WAREHOUSE_LOCATION, WAREHOUSE_LOCATION)
-    );
-  }
-
-  private String getRequiredProperty(Properties properties, String key) {
-    String value = properties.getProperty(key);
-    if (value == null) {
-      throw new IllegalStateException(
-          String.format("Missing property of IcebergRESTCatalog: %s", key));
-    }
-    return value;
+        restConfig.catalogName,
+        restConfig.getCatalogProperties());
   }
 
   public String getUri() {
diff --git a/testdata/bin/minicluster_lakekeeper/README.md 
b/testdata/bin/minicluster_lakekeeper/README.md
new file mode 100644
index 000000000..b1e5b2df8
--- /dev/null
+++ b/testdata/bin/minicluster_lakekeeper/README.md
@@ -0,0 +1,49 @@
+## About Lakekeeper
+Lakekeeper is an Apache-Licensed implementation of the Apache Iceberg REST 
Catalog specification. See more at https://github.com/lakekeeper/lakekeeper
+
+## Prerequisites to use Lakekeeper
+You need docker compose (Compose V2) in your environment. This usually means 
you just need a recent docker version. Sometimes you need to install the docker 
compose plugin.
+
+## Run and stop Lakekeeper in dev environment
+Via the following scripts you can run/stop Lakekeeper. Be aware that each 
restart resets the warehouse contents.
+```
+${IMPALA_HOME}/testdata/bin/run-lakekeeper.sh
+${IMPALA_HOME}/testdata/bin/stop-lakekeeper.sh
+```
+
+## Ingesting data
+Until Impala can write Iceberg tables in the REST Catalog, you can use Trino 
to create tables.
+Let's rebuild our Trino image for this:
+```
+docker stop impala-minicluster-trino
+docker rm impala-minicluster-trino
+./testdata/bin/build-trino-docker-image.sh
+./testdata/bin/run-trino.sh
+```
+
+Let's connect to Trino via its CLI:
+```
+testdata/bin/trino-cli.sh
+```
+
+Now we can execute the following commands:
+```
+show catalogs;
+create schema iceberg_lakekeeper.trino_db;
+use iceberg_lakekeeper.trino_db;
+create table trino_t (i int);
+insert into trino_t values (35);
+```
+
+## Query via Impala
+After this, you should be able to query the table via Impala:
+```
+mkdir /tmp/iceberg_lakekeeper
+cp testdata/bin/minicluster_trino/iceberg_lakekeeper.properties 
/tmp/iceberg_lakekeeper
+
+bin/start-impala-cluster.py --no_catalogd \
+    --impalad_args="--catalogd_deployed=false --use_local_catalog=true \
+    --catalog_config_dir=/tmp/iceberg_lakekeeper/"
+
+bin/impala-shell.sh
+```
diff --git a/testdata/bin/minicluster_lakekeeper/create-default-warehouse.json 
b/testdata/bin/minicluster_lakekeeper/create-default-warehouse.json
new file mode 100644
index 000000000..f969196a9
--- /dev/null
+++ b/testdata/bin/minicluster_lakekeeper/create-default-warehouse.json
@@ -0,0 +1,13 @@
+{
+  "warehouse-name": "lakekeeper_demo",
+  "project-id": "00000000-0000-0000-0000-000000000000",
+  "storage-profile": {
+    "type": "hdfs",
+    "url": "hdfs://localhost:20500",
+    "key-prefix": "/test-warehouse/lakekeeper",
+    "config": {
+      "dfs.client.read.shortcircuit": "true",
+      "dfs.domain.socket.path": "/var/lib/hadoop-hdfs/dn_socket"
+    }
+  }
+}
diff --git a/testdata/bin/minicluster_lakekeeper/docker-compose.yaml 
b/testdata/bin/minicluster_lakekeeper/docker-compose.yaml
new file mode 100644
index 000000000..1e1c6ba25
--- /dev/null
+++ b/testdata/bin/minicluster_lakekeeper/docker-compose.yaml
@@ -0,0 +1,131 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+services:
+  lakekeeper:
+    image: 
${LAKEKEEPER_TEST__SERVER_IMAGE:-quay.io/lakekeeper/catalog:v0.8.3-hdfs-preview}
+    pull_policy: always
+    environment:
+      - LAKEKEEPER__PG_ENCRYPTION_KEY=This-is-NOT-Secure!
+      - 
LAKEKEEPER__PG_DATABASE_URL_READ=postgresql://postgres:postgres@localhost:54321/postgres
+      - 
LAKEKEEPER__PG_DATABASE_URL_WRITE=postgresql://postgres:postgres@localhost:54321/postgres
+      - LAKEKEEPER__ENABLE_HDFS_WITH_SYSTEM_CREDENTIALS=true
+      - RUST_LOG=trace,axum=trace,sqlx=trace,iceberg-catalog=trace
+      - HADOOP_USER_NAME=${USER}
+      - HADOOP_CONF_DIR=/etc/hadoop
+      - RUST_LOG=debug,hdfs_native=trace
+    command: [ "serve" ]
+    healthcheck:
+      test: [ "CMD", "/home/nonroot/iceberg-catalog", "healthcheck" ]
+      interval: 1s
+      timeout: 10s
+      retries: 3
+      start_period: 3s
+    depends_on:
+      migrate:
+        condition: service_completed_successfully
+      db:
+        condition: service_healthy
+    volumes:
+      - ./core-site.xml:/etc/hadoop/core-site.xml:ro
+      - ./hdfs-site.xml:/etc/hadoop/hdfs-site.xml:ro
+    network_mode: host
+
+  migrate:
+    image: 
${LAKEKEEPER_TEST__SERVER_IMAGE:-quay.io/lakekeeper/catalog:v0.8.3-hdfs-preview}
+    pull_policy: always
+    environment:
+      - LAKEKEEPER__PG_ENCRYPTION_KEY=This-is-NOT-Secure!
+      - LAKEKEEPER__ENABLE_HDFS_WITH_SYSTEM_CREDENTIALS=true
+      - 
LAKEKEEPER__PG_DATABASE_URL_READ=postgresql://postgres:postgres@db:5432/postgres
+      - 
LAKEKEEPER__PG_DATABASE_URL_WRITE=postgresql://postgres:postgres@db:5432/postgres
+      - RUST_LOG=info
+    restart: "no"
+    command: [ "migrate" ]
+    depends_on:
+      db:
+        condition: service_healthy
+    networks:
+      lakekeeper_net:
+
+  bootstrap:
+    image: curlimages/curl
+    depends_on:
+      lakekeeper:
+        condition: service_healthy
+    restart: "no"
+    command:
+      - -w
+      - "%{http_code}"
+      - "-X"
+      - "POST"
+      - "-v"
+      - "http://localhost:8181/management/v1/bootstrap"
+      - "-H"
+      - "Content-Type: application/json"
+      - "--data"
+      - '{"accept-terms-of-use": true}'
+      - "-o"
+      - "/dev/null"
+      # - "--fail-with-body"
+    network_mode: host
+
+  initialwarehouse:
+    image: curlimages/curl
+    depends_on:
+      lakekeeper:
+        condition: service_healthy
+      bootstrap:
+        condition: service_completed_successfully
+    restart: "no"
+    command:
+      - -w
+      - "%{http_code}"
+      - "-X"
+      - "POST"
+      - "-v"
+      - "http://localhost:8181/management/v1/warehouse"
+      - "-H"
+      - "Content-Type: application/json"
+      - "--data"
+      - "@create-default-warehouse.json"
+      - "-o"
+      - "/dev/null"
+    volumes:
+      - 
./create-default-warehouse.json:/home/curl_user/create-default-warehouse.json
+    network_mode: host
+
+  db:
+    image: bitnami/postgresql:16.3.0
+    environment:
+      - POSTGRESQL_USERNAME=postgres
+      - POSTGRESQL_PASSWORD=postgres
+      - POSTGRESQL_DATABASE=postgres
+    healthcheck:
+      test: [ "CMD-SHELL", "pg_isready -U postgres -p 5432 -d postgres" ]
+      interval: 2s
+      timeout: 10s
+      retries: 2
+      start_period: 10s
+    networks:
+      lakekeeper_net:
+    ports:
+      - "54321:5432"
+
+networks:
+  lakekeeper_net:
+
diff --git a/testdata/bin/minicluster_trino/Dockerfile 
b/testdata/bin/minicluster_trino/Dockerfile
index 041f03b34..c8ffa2186 100644
--- a/testdata/bin/minicluster_trino/Dockerfile
+++ b/testdata/bin/minicluster_trino/Dockerfile
@@ -27,7 +27,7 @@ RUN \
     echo "-DHADOOP_USER_NAME=$USERNAME" >> /etc/trino/jvm.config
 
 COPY hive-site.xml core-site.xml hdfs-site.xml /etc/
-COPY iceberg_rest.properties iceberg.properties hive.properties 
/etc/trino/catalog/
+COPY iceberg_lakekeeper.properties iceberg_rest.properties iceberg.properties 
hive.properties /etc/trino/catalog/
 
 # Expose the Trino port
 EXPOSE 9091
diff --git a/testdata/bin/minicluster_trino/Dockerfile 
b/testdata/bin/minicluster_trino/iceberg_lakekeeper.properties
similarity index 57%
copy from testdata/bin/minicluster_trino/Dockerfile
copy to testdata/bin/minicluster_trino/iceberg_lakekeeper.properties
index 041f03b34..c72d0abfa 100644
--- a/testdata/bin/minicluster_trino/Dockerfile
+++ b/testdata/bin/minicluster_trino/iceberg_lakekeeper.properties
@@ -15,19 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# Use an official Trino image as the base
-FROM trinodb/trino:latest
-
-# Use the developer username, so Trino will have write access to HDFS
-ARG USERNAME
-
-RUN \
-    sed -i 's/http-server.http.port=8080/http-server.http.port=9091/' 
/etc/trino/config.properties && \
-    sed -i 's/localhost:8080/localhost:9091/' /etc/trino/config.properties && \
-    echo "-DHADOOP_USER_NAME=$USERNAME" >> /etc/trino/jvm.config
-
-COPY hive-site.xml core-site.xml hdfs-site.xml /etc/
-COPY iceberg_rest.properties iceberg.properties hive.properties 
/etc/trino/catalog/
-
-# Expose the Trino port
-EXPOSE 9091
+connector.name=iceberg
+iceberg.catalog.type=rest
+iceberg.rest-catalog.uri=http://localhost:8181/catalog
+iceberg.rest-catalog.warehouse=lakekeeper_demo
+fs.hadoop.enabled=true
+hive.config.resources=/etc/hive-site.xml,/etc/hdfs-site.xml,/etc/core-site.xml
diff --git a/testdata/bin/minicluster_trino/Dockerfile 
b/testdata/bin/run-lakekeeper.sh
old mode 100644
new mode 100755
similarity index 57%
copy from testdata/bin/minicluster_trino/Dockerfile
copy to testdata/bin/run-lakekeeper.sh
index 041f03b34..559ba319b
--- a/testdata/bin/minicluster_trino/Dockerfile
+++ b/testdata/bin/run-lakekeeper.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+#
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -15,19 +17,11 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# Use an official Trino image as the base
-FROM trinodb/trino:latest
-
-# Use the developer username, so Trino will have write access to HDFS
-ARG USERNAME
-
-RUN \
-    sed -i 's/http-server.http.port=8080/http-server.http.port=9091/' 
/etc/trino/config.properties && \
-    sed -i 's/localhost:8080/localhost:9091/' /etc/trino/config.properties && \
-    echo "-DHADOOP_USER_NAME=$USERNAME" >> /etc/trino/jvm.config
+# Copy cluster configs to trino docker directory.
+pushd ${HADOOP_CONF_DIR}
+cp core-site.xml hdfs-site.xml 
${IMPALA_HOME}/testdata/bin/minicluster_lakekeeper
+popd
 
-COPY hive-site.xml core-site.xml hdfs-site.xml /etc/
-COPY iceberg_rest.properties iceberg.properties hive.properties 
/etc/trino/catalog/
+cd ${IMPALA_HOME}/testdata/bin/minicluster_lakekeeper
 
-# Expose the Trino port
-EXPOSE 9091
+docker compose up -d
diff --git a/testdata/bin/minicluster_trino/Dockerfile 
b/testdata/bin/stop-lakekeeper.sh
old mode 100644
new mode 100755
similarity index 57%
copy from testdata/bin/minicluster_trino/Dockerfile
copy to testdata/bin/stop-lakekeeper.sh
index 041f03b34..ca8adad8e
--- a/testdata/bin/minicluster_trino/Dockerfile
+++ b/testdata/bin/stop-lakekeeper.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+#
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -15,19 +17,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# Use an official Trino image as the base
-FROM trinodb/trino:latest
-
-# Use the developer username, so Trino will have write access to HDFS
-ARG USERNAME
-
-RUN \
-    sed -i 's/http-server.http.port=8080/http-server.http.port=9091/' 
/etc/trino/config.properties && \
-    sed -i 's/localhost:8080/localhost:9091/' /etc/trino/config.properties && \
-    echo "-DHADOOP_USER_NAME=$USERNAME" >> /etc/trino/jvm.config
-
-COPY hive-site.xml core-site.xml hdfs-site.xml /etc/
-COPY iceberg_rest.properties iceberg.properties hive.properties 
/etc/trino/catalog/
+cd ${IMPALA_HOME}/testdata/bin/minicluster_lakekeeper
 
-# Expose the Trino port
-EXPOSE 9091
+docker compose down

Reply via email to