This is an automated email from the ASF dual-hosted git repository.
jshao pushed a commit to branch branch-1.0
in repository https://gitbox.apache.org/repos/asf/gravitino.git
The following commit(s) were added to refs/heads/branch-1.0 by this push:
new 83afcf1cd2 [#8481] improvement(Python-CI): remove download hadoop
package task (#8517)
83afcf1cd2 is described below
commit 83afcf1cd2d10876f09635d5777b06e3707a4640
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu Sep 11 12:58:27 2025 +0800
[#8481] improvement(Python-CI): remove download hadoop package task (#8517)
### What changes were proposed in this pull request?
- Remove the Hadoop package download task from the Gradle build
- Copy the Hadoop package from the HDFS container instead
### Why are the changes needed?
I noticed that the Python CI is time-consuming, and the Hadoop package
download task alone takes 2362.881 seconds (about 39 minutes).
#8481
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
Verified by a passing CI run.
Co-authored-by: mchades <[email protected]>
---
clients/client-python/build.gradle.kts | 32 ++++--------------
.../tests/integration/base_hadoop_env.py | 39 +++++++++++++---------
.../tests/integration/containers/base_container.py | 39 ++++++++++++++++++++++
.../tests/integration/integration_test_env.py | 4 +--
.../tests/integration/test_gvfs_with_hdfs.py | 25 +++++++++-----
5 files changed, 87 insertions(+), 52 deletions(-)
diff --git a/clients/client-python/build.gradle.kts
b/clients/client-python/build.gradle.kts
index a9873ea22c..c11f841ace 100644
--- a/clients/client-python/build.gradle.kts
+++ b/clients/client-python/build.gradle.kts
@@ -16,15 +16,12 @@
* specific language governing permissions and limitations
* under the License.
*/
-import de.undercouch.gradle.tasks.download.Download
-import de.undercouch.gradle.tasks.download.Verify
import io.github.piyushroshan.python.VenvTask
import java.net.HttpURLConnection
import java.net.URL
plugins {
id("io.github.piyushroshan.python-gradle-miniforge-plugin") version "1.0.0"
- id("de.undercouch.download") version "5.6.0"
}
pythonPlugin {
@@ -47,6 +44,7 @@ fun waitForServerIsReady(host: String = "http://localhost",
port: Int = 8090, ti
val urlString = "$host:$port/metrics"
val successPattern = Regex("\"version\"\\s*:")
+ println("Waiting for server to be ready at $urlString (timeout: ${timeout /
1000}s)...")
while (true) {
try {
val url = URL(urlString)
@@ -59,6 +57,8 @@ fun waitForServerIsReady(host: String = "http://localhost",
port: Int = 8090, ti
if (responseCode == 200) {
val response = connection.inputStream.bufferedReader().use {
it.readText() }
if (successPattern.containsMatchIn(response)) {
+ val duration = System.currentTimeMillis() - startTime
+ println("\nServer is ready! Took ${duration / 1000.0} seconds.")
return // If this succeeds, the API is up and running
} else {
exception = RuntimeException("API returned unexpected response:
$response")
@@ -74,6 +74,10 @@ fun waitForServerIsReady(host: String = "http://localhost",
port: Int = 8090, ti
if (System.currentTimeMillis() - startTime > timeout) {
throw RuntimeException("Timed out waiting for API to be available",
exception)
}
+
+ // Print a dot to indicate waiting
+ print(".")
+ System.out.flush() // Ensure the dot is printed immediately
Thread.sleep(500) // Wait for 0.5 second before checking again
}
}
@@ -151,10 +155,6 @@ fun generatePypiProjectHomePage() {
}
}
-val hadoopVersion = "2.7.3"
-val hadoopPackName = "hadoop-${hadoopVersion}.tar.gz"
-val hadoopDirName = "hadoop-${hadoopVersion}"
-val hadoopDownloadUrl =
"https://archive.apache.org/dist/hadoop/core/hadoop-${hadoopVersion}/${hadoopPackName}"
tasks {
val pipInstall by registering(VenvTask::class) {
venvExec = "pip"
@@ -186,20 +186,6 @@ tasks {
args = listOf("scripts/generate_version.py")
}
- val downloadHadoopPack by registering(Download::class) {
- dependsOn(build)
- onlyIfModified(true)
- src(hadoopDownloadUrl)
- dest(layout.buildDirectory.dir("tmp"))
- }
-
- val verifyHadoopPack by registering(Verify::class) {
- dependsOn(downloadHadoopPack)
- src(layout.buildDirectory.file("tmp/${hadoopPackName}"))
- algorithm("MD5")
- checksum("3455bb57e4b4906bbea67b58cca78fa8")
- }
-
val integrationTest by registering(VenvTask::class) {
doFirst {
gravitinoServer("start")
@@ -211,9 +197,7 @@ tasks {
val dockerTest = project.rootProject.extra["dockerTest"] as? Boolean ?:
false
val envMap = mapOf<String, Any>().toMutableMap()
if (dockerTest) {
- dependsOn("verifyHadoopPack")
envMap.putAll(mapOf(
- "HADOOP_VERSION" to hadoopVersion,
"PYTHON_BUILD_PATH" to project.rootDir.path +
"/clients/client-python/build"
))
}
@@ -233,8 +217,6 @@ tasks {
doLast {
gravitinoServer("stop")
}
-
- finalizedBy(integrationCoverageReport)
}
val unitCoverageReport by registering(VenvTask::class){
diff --git a/clients/client-python/tests/integration/base_hadoop_env.py
b/clients/client-python/tests/integration/base_hadoop_env.py
index ef820a0f57..d29db03b3b 100644
--- a/clients/client-python/tests/integration/base_hadoop_env.py
+++ b/clients/client-python/tests/integration/base_hadoop_env.py
@@ -25,11 +25,11 @@ from gravitino.exceptions.base import
GravitinoRuntimeException
logger = logging.getLogger(__name__)
-HADOOP_VERSION = os.environ.get("HADOOP_VERSION")
-PYTHON_BUILD_PATH = os.environ.get("PYTHON_BUILD_PATH")
-
class BaseHadoopEnvironment:
+ PYTHON_BUILD_PATH = os.environ.get("PYTHON_BUILD_PATH")
+ BASE_DIR_NAME = "hadoop-2.7.3"
+
@classmethod
def init_hadoop_env(cls):
cls._unzip_hadoop_pack()
@@ -39,16 +39,16 @@ class BaseHadoopEnvironment:
@classmethod
def clear_hadoop_env(cls):
try:
- shutil.rmtree(f"{PYTHON_BUILD_PATH}/hadoop")
+ shutil.rmtree(f"{cls.PYTHON_BUILD_PATH}/hadoop")
except Exception as e:
raise GravitinoRuntimeException(
- f"Failed to delete dir '{PYTHON_BUILD_PATH}/hadoop': {e}"
+ f"Failed to delete dir '{cls.PYTHON_BUILD_PATH}/hadoop': {e}"
) from e
@classmethod
def _unzip_hadoop_pack(cls):
- hadoop_pack = f"{PYTHON_BUILD_PATH}/tmp/hadoop-{HADOOP_VERSION}.tar.gz"
- unzip_dir = f"{PYTHON_BUILD_PATH}/hadoop"
+ hadoop_pack = f"{cls.PYTHON_BUILD_PATH}/tmp/{cls.BASE_DIR_NAME}.tar"
+ unzip_dir = f"{cls.PYTHON_BUILD_PATH}/hadoop"
logger.info("Unzip hadoop pack from %s.", hadoop_pack)
# unzip the pack
if os.path.exists(unzip_dir):
@@ -70,13 +70,13 @@ class BaseHadoopEnvironment:
def _configure_hadoop_environment(cls):
logger.info("Configure hadoop environment.")
os.putenv("HADOOP_USER_NAME", "anonymous")
- os.putenv("HADOOP_HOME",
f"{PYTHON_BUILD_PATH}/hadoop/hadoop-{HADOOP_VERSION}")
- os.putenv(
- "HADOOP_CONF_DIR",
- f"{PYTHON_BUILD_PATH}/hadoop/hadoop-{HADOOP_VERSION}/etc/hadoop",
- )
+ os.putenv("HADOOP_HOME",
f"{cls.PYTHON_BUILD_PATH}/hadoop/{cls.BASE_DIR_NAME}")
+ conf_path =
f"{cls.PYTHON_BUILD_PATH}/hadoop/{cls.BASE_DIR_NAME}/etc/hadoop"
+ os.putenv("HADOOP_CONF_DIR", conf_path)
+ # clean up the conf dir brought by the docker container, using default
is enough
+ shutil.rmtree(conf_path)
hadoop_shell_path = (
- f"{PYTHON_BUILD_PATH}/hadoop/hadoop-{HADOOP_VERSION}/bin/hadoop"
+ f"{cls.PYTHON_BUILD_PATH}/hadoop/{cls.BASE_DIR_NAME}/bin/hadoop"
)
# get the classpath
try:
@@ -86,11 +86,18 @@ class BaseHadoopEnvironment:
text=True,
check=True,
)
- if result.returncode == 0:
- os.putenv("CLASSPATH", str(result.stdout))
+ classpath = ""
+ # there are some warning messages in the stdout, we need to parse
the classpath line
+ for line in result.stdout.splitlines():
+ if line.strip().startswith("/"):
+ classpath = line.strip()
+ break
+
+ if classpath:
+ os.putenv("CLASSPATH", classpath)
else:
raise GravitinoRuntimeException(
- f"Command failed with return code is not 0, stdout:
{result.stdout}, stderr:{result.stderr}"
+ f"Could not parse classpath from 'hadoop classpath --glob'
command output: {result.stdout}"
)
except subprocess.CalledProcessError as e:
raise GravitinoRuntimeException(
diff --git
a/clients/client-python/tests/integration/containers/base_container.py
b/clients/client-python/tests/integration/containers/base_container.py
index 764d0f62da..ebc9eee018 100644
--- a/clients/client-python/tests/integration/containers/base_container.py
+++ b/clients/client-python/tests/integration/containers/base_container.py
@@ -16,6 +16,7 @@
# under the License.
import logging
+import os
from typing import Dict
import docker
@@ -90,6 +91,44 @@ class BaseContainer:
def get_ip(self):
return self._ip
+ def get_tar_from_docker(self, src_path: str, dest_path: str):
+ """Copies a file or directory from the container to the host as a tar
archive.
+
+ Args:
+ src_path: The source path inside the container.
+ dest_path: The destination file path on the host for the tar
archive.
+ """
+ if self._container is None:
+ raise GravitinoRuntimeException(
+ f"The container {self._container_name} has not been
initialized."
+ )
+
+ try:
+ bits, stat = self._container.get_archive(src_path)
+
+ if os.path.isdir(dest_path):
+ base_name = os.path.basename(src_path.strip("/"))
+ dest_file_path = os.path.join(dest_path, f"{base_name}.tar")
+ else:
+ dest_file_path = dest_path
+
+ # Write the tar stream directly to the destination file.
+ with open(dest_file_path, "wb") as f:
+ for chunk in bits:
+ f.write(chunk)
+
+ logger.info(
+ f"Successfully copied '{src_path}' from container
'{self._container_name}' to '{dest_file_path}'."
+ )
+
+ except Exception as e:
+ logger.error(
+ f"Failed to copy '{src_path}' from container
'{self._container_name}' to '{dest_path}': {e}"
+ )
+ raise GravitinoRuntimeException(
+ f"Failed to copy directory from container: {e}"
+ ) from e
+
def close(self):
try:
self._container.kill()
diff --git a/clients/client-python/tests/integration/integration_test_env.py
b/clients/client-python/tests/integration/integration_test_env.py
index 308303e8a7..7b54f1f452 100644
--- a/clients/client-python/tests/integration/integration_test_env.py
+++ b/clients/client-python/tests/integration/integration_test_env.py
@@ -41,8 +41,8 @@ def get_gravitino_server_version(**kwargs):
response.raise_for_status() # raise an exception for bad status codes
response.close()
return True
- except requests.exceptions.RequestException:
- logger.warning("Failed to access the Gravitino server")
+ except requests.exceptions.RequestException as e:
+ logger.warning("Failed to access the Gravitino server: %s", e)
return False
diff --git a/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
b/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
index 9f79b10b1a..922793a545 100644
--- a/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
+++ b/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
@@ -104,6 +104,13 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
cls.hdfs_container = HDFSContainer()
hdfs_container_ip = cls.hdfs_container.get_ip()
+ # copy hadoop tar from hdfs container
+ build_path = os.environ.get("PYTHON_BUILD_PATH")
+ dest_dir = os.path.join(build_path, "tmp")
+ os.makedirs(dest_dir, exist_ok=True)
+ cls.hdfs_container.get_tar_from_docker(
+ f"/opt/{BaseHadoopEnvironment.BASE_DIR_NAME}", dest_dir
+ )
# init hadoop env
BaseHadoopEnvironment.init_hadoop_env()
cls.config = {
@@ -219,8 +226,8 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
cls.fileset_ident,
catalog.as_fileset_catalog().drop_fileset(ident=cls.fileset_ident),
)
- except GravitinoRuntimeException:
- logger.warning("Failed to drop fileset %s", cls.fileset_ident)
+ except GravitinoRuntimeException as e:
+ logger.warning("Failed to drop fileset %s: %s", cls.fileset_ident,
str(e))
try:
logger.info(
@@ -230,8 +237,8 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
schema_name=cls.schema_name, cascade=True
),
)
- except GravitinoRuntimeException:
- logger.warning("Failed to drop schema %s", cls.schema_name)
+ except GravitinoRuntimeException as e:
+ logger.warning("Failed to drop schema %s: %s", cls.schema_name,
str(e))
try:
logger.info(
@@ -239,17 +246,17 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
cls.catalog_name,
cls.gravitino_client.drop_catalog(name=cls.catalog_name,
force=True),
)
- except GravitinoRuntimeException:
- logger.warning("Failed to drop catalog %s", cls.catalog_name)
+ except GravitinoRuntimeException as e:
+ logger.warning("Failed to drop catalog %s: %s", cls.catalog_name,
str(e))
try:
logger.info(
"Drop metalake %s[%s]",
cls.metalake_name,
- cls.gravitino_admin_client.drop_metalake(cls.metalake_name),
+ cls.gravitino_admin_client.drop_metalake(cls.metalake_name,
force=True),
)
- except GravitinoRuntimeException:
- logger.warning("Failed to drop metalake %s", cls.metalake_name)
+ except GravitinoRuntimeException as e:
+ logger.warning("Failed to drop metalake %s: %s",
cls.metalake_name, str(e))
def test_simple_auth(self):
options = {"auth_type": "simple"}