This is an automated email from the ASF dual-hosted git repository.

jshao pushed a commit to branch branch-1.0
in repository https://gitbox.apache.org/repos/asf/gravitino.git


The following commit(s) were added to refs/heads/branch-1.0 by this push:
     new 83afcf1cd2 [#8481] improvement(Python-CI): remove download hadoop 
package task (#8517)
83afcf1cd2 is described below

commit 83afcf1cd2d10876f09635d5777b06e3707a4640
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu Sep 11 12:58:27 2025 +0800

    [#8481] improvement(Python-CI): remove download hadoop package task (#8517)
    
    ### What changes were proposed in this pull request?
    
    - remove the download Hadoop package task from gradle
    - copy hadoop package from HDFS container
    
    ### Why are the changes needed?
    
    I noticed that the Python-CI is time-consuming, and the Hadoop package
    download task takes 2362.881 seconds.
    
    #8481
    
    
    ### Does this PR introduce _any_ user-facing change?
    
    no
    
    ### How was this patch tested?
    
    CI pass
    
    Co-authored-by: mchades <[email protected]>
---
 clients/client-python/build.gradle.kts             | 32 ++++--------------
 .../tests/integration/base_hadoop_env.py           | 39 +++++++++++++---------
 .../tests/integration/containers/base_container.py | 39 ++++++++++++++++++++++
 .../tests/integration/integration_test_env.py      |  4 +--
 .../tests/integration/test_gvfs_with_hdfs.py       | 25 +++++++++-----
 5 files changed, 87 insertions(+), 52 deletions(-)

diff --git a/clients/client-python/build.gradle.kts 
b/clients/client-python/build.gradle.kts
index a9873ea22c..c11f841ace 100644
--- a/clients/client-python/build.gradle.kts
+++ b/clients/client-python/build.gradle.kts
@@ -16,15 +16,12 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-import de.undercouch.gradle.tasks.download.Download
-import de.undercouch.gradle.tasks.download.Verify
 import io.github.piyushroshan.python.VenvTask
 import java.net.HttpURLConnection
 import java.net.URL
 
 plugins {
   id("io.github.piyushroshan.python-gradle-miniforge-plugin") version "1.0.0"
-  id("de.undercouch.download") version "5.6.0"
 }
 
 pythonPlugin {
@@ -47,6 +44,7 @@ fun waitForServerIsReady(host: String = "http://localhost", 
port: Int = 8090, ti
   val urlString = "$host:$port/metrics"
   val successPattern = Regex("\"version\"\\s*:")
 
+  println("Waiting for server to be ready at $urlString (timeout: ${timeout / 
1000}s)...")
   while (true) {
     try {
       val url = URL(urlString)
@@ -59,6 +57,8 @@ fun waitForServerIsReady(host: String = "http://localhost", 
port: Int = 8090, ti
       if (responseCode == 200) {
         val response = connection.inputStream.bufferedReader().use { 
it.readText() }
         if (successPattern.containsMatchIn(response)) {
+          val duration = System.currentTimeMillis() - startTime
+          println("\nServer is ready! Took ${duration / 1000.0} seconds.")
           return  // If this succeeds, the API is up and running
         } else {
           exception = RuntimeException("API returned unexpected response: 
$response")
@@ -74,6 +74,10 @@ fun waitForServerIsReady(host: String = "http://localhost", 
port: Int = 8090, ti
     if (System.currentTimeMillis() - startTime > timeout) {
       throw RuntimeException("Timed out waiting for API to be available", 
exception)
     }
+
+    // Print a dot to indicate waiting
+    print(".")
+    System.out.flush() // Ensure the dot is printed immediately
     Thread.sleep(500)  // Wait for 0.5 second before checking again
   }
 }
@@ -151,10 +155,6 @@ fun generatePypiProjectHomePage() {
   }
 }
 
-val hadoopVersion = "2.7.3"
-val hadoopPackName = "hadoop-${hadoopVersion}.tar.gz"
-val hadoopDirName = "hadoop-${hadoopVersion}"
-val hadoopDownloadUrl = 
"https://archive.apache.org/dist/hadoop/core/hadoop-${hadoopVersion}/${hadoopPackName}"
 tasks {
   val pipInstall by registering(VenvTask::class) {
     venvExec = "pip"
@@ -186,20 +186,6 @@ tasks {
     args = listOf("scripts/generate_version.py")
   }
 
-  val downloadHadoopPack by registering(Download::class) {
-    dependsOn(build)
-    onlyIfModified(true)
-    src(hadoopDownloadUrl)
-    dest(layout.buildDirectory.dir("tmp"))
-  }
-
-  val verifyHadoopPack by registering(Verify::class) {
-    dependsOn(downloadHadoopPack)
-    src(layout.buildDirectory.file("tmp/${hadoopPackName}"))
-    algorithm("MD5")
-    checksum("3455bb57e4b4906bbea67b58cca78fa8")
-  }
-
   val integrationTest by registering(VenvTask::class) {
     doFirst {
       gravitinoServer("start")
@@ -211,9 +197,7 @@ tasks {
     val dockerTest = project.rootProject.extra["dockerTest"] as? Boolean ?: 
false
     val envMap = mapOf<String, Any>().toMutableMap()
     if (dockerTest) {
-      dependsOn("verifyHadoopPack")
       envMap.putAll(mapOf(
-        "HADOOP_VERSION" to hadoopVersion,
         "PYTHON_BUILD_PATH" to project.rootDir.path + 
"/clients/client-python/build"
       ))
     }
@@ -233,8 +217,6 @@ tasks {
     doLast {
       gravitinoServer("stop")
     }
-
-    finalizedBy(integrationCoverageReport)
   }
 
   val unitCoverageReport by registering(VenvTask::class){
diff --git a/clients/client-python/tests/integration/base_hadoop_env.py 
b/clients/client-python/tests/integration/base_hadoop_env.py
index ef820a0f57..d29db03b3b 100644
--- a/clients/client-python/tests/integration/base_hadoop_env.py
+++ b/clients/client-python/tests/integration/base_hadoop_env.py
@@ -25,11 +25,11 @@ from gravitino.exceptions.base import 
GravitinoRuntimeException
 
 logger = logging.getLogger(__name__)
 
-HADOOP_VERSION = os.environ.get("HADOOP_VERSION")
-PYTHON_BUILD_PATH = os.environ.get("PYTHON_BUILD_PATH")
-
 
 class BaseHadoopEnvironment:
+    PYTHON_BUILD_PATH = os.environ.get("PYTHON_BUILD_PATH")
+    BASE_DIR_NAME = "hadoop-2.7.3"
+
     @classmethod
     def init_hadoop_env(cls):
         cls._unzip_hadoop_pack()
@@ -39,16 +39,16 @@ class BaseHadoopEnvironment:
     @classmethod
     def clear_hadoop_env(cls):
         try:
-            shutil.rmtree(f"{PYTHON_BUILD_PATH}/hadoop")
+            shutil.rmtree(f"{cls.PYTHON_BUILD_PATH}/hadoop")
         except Exception as e:
             raise GravitinoRuntimeException(
-                f"Failed to delete dir '{PYTHON_BUILD_PATH}/hadoop': {e}"
+                f"Failed to delete dir '{cls.PYTHON_BUILD_PATH}/hadoop': {e}"
             ) from e
 
     @classmethod
     def _unzip_hadoop_pack(cls):
-        hadoop_pack = f"{PYTHON_BUILD_PATH}/tmp/hadoop-{HADOOP_VERSION}.tar.gz"
-        unzip_dir = f"{PYTHON_BUILD_PATH}/hadoop"
+        hadoop_pack = f"{cls.PYTHON_BUILD_PATH}/tmp/{cls.BASE_DIR_NAME}.tar"
+        unzip_dir = f"{cls.PYTHON_BUILD_PATH}/hadoop"
         logger.info("Unzip hadoop pack from %s.", hadoop_pack)
         # unzip the pack
         if os.path.exists(unzip_dir):
@@ -70,13 +70,13 @@ class BaseHadoopEnvironment:
     def _configure_hadoop_environment(cls):
         logger.info("Configure hadoop environment.")
         os.putenv("HADOOP_USER_NAME", "anonymous")
-        os.putenv("HADOOP_HOME", 
f"{PYTHON_BUILD_PATH}/hadoop/hadoop-{HADOOP_VERSION}")
-        os.putenv(
-            "HADOOP_CONF_DIR",
-            f"{PYTHON_BUILD_PATH}/hadoop/hadoop-{HADOOP_VERSION}/etc/hadoop",
-        )
+        os.putenv("HADOOP_HOME", 
f"{cls.PYTHON_BUILD_PATH}/hadoop/{cls.BASE_DIR_NAME}")
+        conf_path = 
f"{cls.PYTHON_BUILD_PATH}/hadoop/{cls.BASE_DIR_NAME}/etc/hadoop"
+        os.putenv("HADOOP_CONF_DIR", conf_path)
+        # clean up the conf dir brought by the docker container, using default 
is enough
+        shutil.rmtree(conf_path)
         hadoop_shell_path = (
-            f"{PYTHON_BUILD_PATH}/hadoop/hadoop-{HADOOP_VERSION}/bin/hadoop"
+            f"{cls.PYTHON_BUILD_PATH}/hadoop/{cls.BASE_DIR_NAME}/bin/hadoop"
         )
         # get the classpath
         try:
@@ -86,11 +86,18 @@ class BaseHadoopEnvironment:
                 text=True,
                 check=True,
             )
-            if result.returncode == 0:
-                os.putenv("CLASSPATH", str(result.stdout))
+            classpath = ""
+            # there are some warning messages in the stdout, we need to parse 
the classpath line
+            for line in result.stdout.splitlines():
+                if line.strip().startswith("/"):
+                    classpath = line.strip()
+                    break
+
+            if classpath:
+                os.putenv("CLASSPATH", classpath)
             else:
                 raise GravitinoRuntimeException(
-                    f"Command failed with return code is not 0, stdout: 
{result.stdout}, stderr:{result.stderr}"
+                    f"Could not parse classpath from 'hadoop classpath --glob' 
command output: {result.stdout}"
                 )
         except subprocess.CalledProcessError as e:
             raise GravitinoRuntimeException(
diff --git 
a/clients/client-python/tests/integration/containers/base_container.py 
b/clients/client-python/tests/integration/containers/base_container.py
index 764d0f62da..ebc9eee018 100644
--- a/clients/client-python/tests/integration/containers/base_container.py
+++ b/clients/client-python/tests/integration/containers/base_container.py
@@ -16,6 +16,7 @@
 # under the License.
 
 import logging
+import os
 from typing import Dict
 
 import docker
@@ -90,6 +91,44 @@ class BaseContainer:
     def get_ip(self):
         return self._ip
 
+    def get_tar_from_docker(self, src_path: str, dest_path: str):
+        """Copies a file or directory from the container to the host as a tar 
archive.
+
+        Args:
+            src_path: The source path inside the container.
+            dest_path: The destination file path on the host for the tar 
archive.
+        """
+        if self._container is None:
+            raise GravitinoRuntimeException(
+                f"The container {self._container_name} has not been 
initialized."
+            )
+
+        try:
+            bits, stat = self._container.get_archive(src_path)
+
+            if os.path.isdir(dest_path):
+                base_name = os.path.basename(src_path.strip("/"))
+                dest_file_path = os.path.join(dest_path, f"{base_name}.tar")
+            else:
+                dest_file_path = dest_path
+
+            # Write the tar stream directly to the destination file.
+            with open(dest_file_path, "wb") as f:
+                for chunk in bits:
+                    f.write(chunk)
+
+            logger.info(
+                f"Successfully copied '{src_path}' from container 
'{self._container_name}' to '{dest_file_path}'."
+            )
+
+        except Exception as e:
+            logger.error(
+                f"Failed to copy '{src_path}' from container 
'{self._container_name}' to '{dest_path}': {e}"
+            )
+            raise GravitinoRuntimeException(
+                f"Failed to copy directory from container: {e}"
+            ) from e
+
     def close(self):
         try:
             self._container.kill()
diff --git a/clients/client-python/tests/integration/integration_test_env.py 
b/clients/client-python/tests/integration/integration_test_env.py
index 308303e8a7..7b54f1f452 100644
--- a/clients/client-python/tests/integration/integration_test_env.py
+++ b/clients/client-python/tests/integration/integration_test_env.py
@@ -41,8 +41,8 @@ def get_gravitino_server_version(**kwargs):
         response.raise_for_status()  # raise an exception for bad status codes
         response.close()
         return True
-    except requests.exceptions.RequestException:
-        logger.warning("Failed to access the Gravitino server")
+    except requests.exceptions.RequestException as e:
+        logger.warning("Failed to access the Gravitino server: %s", e)
         return False
 
 
diff --git a/clients/client-python/tests/integration/test_gvfs_with_hdfs.py 
b/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
index 9f79b10b1a..922793a545 100644
--- a/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
+++ b/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
@@ -104,6 +104,13 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
 
         cls.hdfs_container = HDFSContainer()
         hdfs_container_ip = cls.hdfs_container.get_ip()
+        # copy hadoop tar from hdfs container
+        build_path = os.environ.get("PYTHON_BUILD_PATH")
+        dest_dir = os.path.join(build_path, "tmp")
+        os.makedirs(dest_dir, exist_ok=True)
+        cls.hdfs_container.get_tar_from_docker(
+            f"/opt/{BaseHadoopEnvironment.BASE_DIR_NAME}", dest_dir
+        )
         # init hadoop env
         BaseHadoopEnvironment.init_hadoop_env()
         cls.config = {
@@ -219,8 +226,8 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
                 cls.fileset_ident,
                 
catalog.as_fileset_catalog().drop_fileset(ident=cls.fileset_ident),
             )
-        except GravitinoRuntimeException:
-            logger.warning("Failed to drop fileset %s", cls.fileset_ident)
+        except GravitinoRuntimeException as e:
+            logger.warning("Failed to drop fileset %s: %s", cls.fileset_ident, 
str(e))
 
         try:
             logger.info(
@@ -230,8 +237,8 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
                     schema_name=cls.schema_name, cascade=True
                 ),
             )
-        except GravitinoRuntimeException:
-            logger.warning("Failed to drop schema %s", cls.schema_name)
+        except GravitinoRuntimeException as e:
+            logger.warning("Failed to drop schema %s: %s", cls.schema_name, 
str(e))
 
         try:
             logger.info(
@@ -239,17 +246,17 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
                 cls.catalog_name,
                 cls.gravitino_client.drop_catalog(name=cls.catalog_name, 
force=True),
             )
-        except GravitinoRuntimeException:
-            logger.warning("Failed to drop catalog %s", cls.catalog_name)
+        except GravitinoRuntimeException as e:
+            logger.warning("Failed to drop catalog %s: %s", cls.catalog_name, 
str(e))
 
         try:
             logger.info(
                 "Drop metalake %s[%s]",
                 cls.metalake_name,
-                cls.gravitino_admin_client.drop_metalake(cls.metalake_name),
+                cls.gravitino_admin_client.drop_metalake(cls.metalake_name, 
force=True),
             )
-        except GravitinoRuntimeException:
-            logger.warning("Failed to drop metalake %s", cls.metalake_name)
+        except GravitinoRuntimeException as e:
+            logger.warning("Failed to drop metalake %s: %s", 
cls.metalake_name, str(e))
 
     def test_simple_auth(self):
         options = {"auth_type": "simple"}

Reply via email to