This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch python
in repository https://gitbox.apache.org/repos/asf/datasketches-spark.git


The following commit(s) were added to refs/heads/python by this push:
     new d63c5aa  cleaner build.sbt that better interacts with setting up python.  many changes to make python build work for wheel or sdist
d63c5aa is described below

commit d63c5aa1a7d6d4fa010aeaa8cf11a2dc586a707d
Author: Jon Malkin <[email protected]>
AuthorDate: Tue Feb 11 20:42:45 2025 -0800

    cleaner build.sbt that better interacts with setting up python.  many changes to make python build work for wheel or sdist
---
 build.sbt                                          | 40 ++++--------
 project/BuildUtils.scala                           | 75 +++++++++++++++++++---
 python/MANIFEST.in                                 |  8 +--
 python/pyproject.toml                              | 53 +++++++++++++++
 python/setup.py                                    | 73 +++++++++++----------
 python/src/datasketches_spark/__init__.py          |  7 ++
 .../datasketches_spark/_version.py}                | 13 ++--
 python/src/datasketches_spark/common.py            |  6 ++
 python/src/datasketches_spark/kll.py               |  6 ++
 python/MANIFEST.in => version.cfg                  |  8 +--
 version.cfg.in                                     |  1 -
 11 files changed, 198 insertions(+), 92 deletions(-)

diff --git a/build.sbt b/build.sbt
index 38e2a86..bff8af7 100644
--- a/build.sbt
+++ b/build.sbt
@@ -16,24 +16,12 @@
  */
 
 import scala.xml.dtd.DEFAULT
-import scala.io.Source
 
 import BuildUtils._
 
 val DEFAULT_SCALA_VERSION = "2.12.20"
 val DEFAULT_SPARK_VERSION = "3.5.4"
-val DEFAULT_JDK_VERSION = "17"
-
-// Map of JVM version prefix to:
-// (JVM major version, datasketches-java version)
-// TODO: consider moving to external file
-val jvmVersionMap = Map(
-  "21" -> ("21", "8.0.0"),
-  "17" -> ("17", "7.0.1"),
-  "11" -> ("11", "6.2.0"),
-  "8"  -> ("8",  "6.2.0"),
-  "1.8" -> ("8", "6.2.0")
-)
+val DEFAULT_JDK_VERSION = "11"
 
 // version processing logic
 val scalaVersion = settingKey[String]("The version of Scala")
@@ -56,10 +44,12 @@ val dsJavaVersionValue = jvmVersionMap.get(jvmVersionValue).map(_._2).getOrElse(
 
 lazy val copyDatasketchesDependencies = taskKey[Seq[File]]("Copy dependencies to a known location")
 
+lazy val cleanPythonVersionFile = taskKey[Unit]("Clean the python version file")
+
 lazy val root = (project in file("."))
   .settings(
     name := "datasketches-spark",
-    version := readVersion("version.cfg.in"),
+    version := readVersionAndCopyToPython("version.cfg"),
     organization := "org.apache.datasketches",
     description := "The Apache DataSketches package for Spark",
     licenses += ("Apache-2.0", 
url("http://www.apache.org/licenses/LICENSE-2.0";)),
@@ -90,18 +80,13 @@ lazy val root = (project in file("."))
       // we want to copy non-provided/non-test dependencies to a known location
       // so that they can be obtained easily
       val targetLibDir = target.value / "lib"
-      IO.createDirectory(targetLibDir)
-      val dependencyJars = (Compile / dependencyClasspath).value.collect {
-        case attr if (attr.data.getName.startsWith("datasketches-java") || attr.data.getName.startsWith("datasketches-memory"))
-                      && attr.data.getName.endsWith(".jar") =>
-          val file = attr.data
-          val targetFile = targetLibDir / file.getName
-          IO.copyFile(file, targetFile)
-          targetFile
-      }
-      dependencyJars
+      val listFile = targetLibDir / "dependencies.txt"
+      val dependencies = (Compile / dependencyClasspath).value
+      BuildUtils.copyDependenciesAndWriteList(targetLibDir, dependencies, listFile)
     },
-    Compile / packageBin := (Compile / packageBin).dependsOn(copyDatasketchesDependencies).value,
+    Compile / packageBin := (Compile / packageBin)
+      .dependsOn(copyDatasketchesDependencies)
+      .andFinally(readVersionAndCopyToPython("version.cfg")).value,
     Test / testOptions += Tests.Argument(TestFrameworks.ScalaTest, "-oD"),
     // additional options for java 17
     Test / fork := {
@@ -118,6 +103,7 @@ lazy val root = (project in file("."))
       }
     },
     Test / logBuffered := false,
-    // Level.INFO is needed to see detailed output when running tests
-    Test / logLevel := Level.Info
+    Test / logLevel := Level.Info,
+    cleanPythonVersionFile := BuildUtils.cleanPythonVersionFile(),
+    clean := clean.dependsOn(cleanPythonVersionFile).value
   )
diff --git a/project/BuildUtils.scala b/project/BuildUtils.scala
index 40e22e6..e0dd64a 100644
--- a/project/BuildUtils.scala
+++ b/project/BuildUtils.scala
@@ -15,26 +15,81 @@
  * limitations under the License.
  */
 
+import sbt._
+import java.io.{BufferedWriter, FileWriter}
 import scala.io.Source
 
 object BuildUtils {
 
-def generateVersionCfg(filename: String): Unit = {
-  val version = readVersion(filename + ".in")
-  val writer = new java.io.PrintWriter(filename) {
-    write(version)
-    close()
+// Map of JVM version prefix to:
+// (JVM major version, datasketches-java version)
+// TODO: consider moving to external file
+val jvmVersionMap = Map(
+  "21" -> ("21", "8.0.0"),
+  "17" -> ("17", "7.0.1"),
+  "11" -> ("11", "6.2.0"),
+  "8"  -> ("8",  "6.2.0"),
+  "1.8" -> ("8", "6.2.0")
+)
+
+// TODO: any way to avoid hardcoding this?
+//val pythonVersionFileName = "python/src/datasketches_spark/version.py"
+val pythonVersionFileName = "python/version.txt"
+
+// reads the version file, reformats as needed for python, and stores
+// in the python subdirectory as the __version__ function for the package
+def readVersionAndCopyToPython(filename: String): String = {
+  val bufferedSource = Source.fromFile(filename)
+  val version = try {
+    bufferedSource.getLines.find(line => !line.trim.startsWith("#") && !line.trim.isBlank()).get
+  } finally {
+    bufferedSource.close()
+  }
+
+  // write version to python subdirectory
+  val pyVersion = version.replace("-SNAPSHOT", ".dev0")
+  val writer = new BufferedWriter(new FileWriter(pythonVersionFileName))
+  try {
+    writer.write(pyVersion)
+    writer.newLine()
+  } finally {
+    writer.close()
   }
+  version
 }
 
-def readVersion(filename: String): String = {
-  // TODO: only generate version.cfg if version.cfg.in is newer
-  val bufferedSource = Source.fromFile(filename)
+// removes the python version file from the python subdir
+def cleanPythonVersionFile(): Unit = {
+  val pyFile = new File(pythonVersionFileName)
+  if (pyFile.exists()) {
+    pyFile.delete()
+  }
+}
+
+// copies the datasketches dependencies to a known location in /target
+def copyDependenciesAndWriteList(targetLibDir: File, dependencies: Seq[Attributed[File]], listFile: File): Seq[File] = {
+  IO.createDirectory(targetLibDir)
+  val dependencyJars = dependencies.collect {
+    case attr if (attr.data.getName.startsWith("datasketches-java") || attr.data.getName.startsWith("datasketches-memory"))
+              && attr.data.getName.endsWith(".jar") =>
+      val file = attr.data
+      val targetFile = targetLibDir / file.getName
+      IO.copyFile(file, targetFile)
+      targetFile
+  }
+
+  // write list of copied jars to file so we have full names/versions
+  val writer = new BufferedWriter(new FileWriter(listFile))
   try {
-    bufferedSource.getLines.find(line => !line.trim.startsWith("#")).get
+    dependencyJars.foreach { file =>
+      writer.write(file.getName)
+      writer.newLine()
+    }
   } finally {
-    bufferedSource.close()
+    writer.close()
   }
+
+  dependencyJars
 }
 
 }
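
(Illustration only, not part of this commit: with the version.cfg introduced later in this diff, which contains 0.1.0-SNAPSHOT, the helper above is expected to behave roughly as sketched below. The call site is the one already present in build.sbt above.)

    // In build.sbt, with version.cfg containing "0.1.0-SNAPSHOT":
    version := readVersionAndCopyToPython("version.cfg")
    // - reads the first non-comment, non-blank line of version.cfg: "0.1.0-SNAPSHOT"
    // - writes the Python-friendly form to python/version.txt: "0.1.0.dev0"
    // - returns "0.1.0-SNAPSHOT", which becomes the sbt project version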
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
index fb0c1bf..1ef57bb 100644
--- a/python/MANIFEST.in
+++ b/python/MANIFEST.in
@@ -15,10 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+include version.txt
 
-# build/config files
-include ../version.cfg.in
-
-# content files
 graft src
-graft test
+graft tests
+graft src/datasketches_spark/deps/*
\ No newline at end of file
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100644
index 0000000..648773d
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,53 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "datasketches_spark"
+#version = { file = "version.txt" }
+dynamic = ["version"]
+description = "The Apache DataSketches Library for Python"
+authors = [
+  { name = "Apache Software Foundation", email = 
"[email protected]" }
+]
+license = { text = "Apache License 2.0" }
+readme = "README.md"
+requires-python = ">=3.8"
+dependencies = [
+  "pyspark"
+]
+
+[tool.setuptools]
+package-dir = { "" = "src" }
+
+[tool.setuptools.dynamic]
+version = { file = "version.txt" }
+
+[tool.setuptools.package-data]
+datasketches_spark = ["deps/*"]
+
+[tool.setuptools.packages.find]
+where = ["src"]
+include = ["datasketches_spark"]
+exclude = ["datasketches_spark.deps", "datasketches_spark.tests"]
+
+[tool.cibuildwheel]
+build-verbosity = 0  # options: 1, 2, or 3
+skip = ["cp36-*", "cp37-*", "cp38-*", "pp*", "*-win32"]
\ No newline at end of file
diff --git a/python/setup.py b/python/setup.py
index ddb17b1..d556086 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -15,18 +15,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-#import importlib.util
 import glob
 import os
 import sys
-#import ctypes
 from setuptools import setup, find_packages
 #from setuptools.command.install import install
 from shutil import copyfile #, copytree, rmtree
 
 DS_SPARK_HOME = os.environ.get("DS_SPARK_HOME", os.path.abspath("../"))
-with open(f'{DS_SPARK_HOME}/version.cfg.in', 'r') as file:
-    VERSION = file.read().rstrip()
+#with open(f'{DS_SPARK_HOME}/version.cfg.in', 'r') as file:
+#    VERSION = file.read().rstrip()
 TEMP_PATH = "src/datasketches_spark/deps" # we can store the relevant jars in 
here
 
 # An error message if trying to run this without first building the jars
@@ -37,29 +35,24 @@ from source, you need to first build the jars.
 To build the jars, run the following command from the root directory of
 the repository:
     sbt clean package
-Next, you can return to this diretory and resume.
+
+If building for pyspark, you should build the jar for each version of
+Scala you expect to use. The Scala version can be set via the
+SCALA_VERSION environment variable.
+
+Then return to this directory and resume building your sdist or wheel.
 """
 
 # Find the datasketches-spark jar path -- other dependencies handled separately
 DS_SPARK_JAR_PATH = glob.glob(os.path.join(DS_SPARK_HOME, "target/scala-*/"))
-if len(DS_SPARK_JAR_PATH) == 1:
-    DS_SPARK_JAR_PATH = DS_SPARK_JAR_PATH[0]
-elif len(DS_SPARK_JAR_PATH) > 1:
-    print(
-        "Found jars for multiple scala versions ({0}). Please clean up the 
target directory".format(
-            DS_SPARK_JAR_PATH
-        ),
-        file=sys.stderr
-    )
-    sys.exit(-1)
-elif len(DS_SPARK_JAR_PATH) == 0: # core spark also checks for TEMP_PATH -- unclear why?
+if len(DS_SPARK_JAR_PATH) == 0:
     print(missing_jars_message, file=sys.stderr)
     sys.exit(-1)
 
 # Find the datasketches-java and datasketches-memory dependency jar path
-DS_JAVA_JAR_PATH = glob.glob(os.path.join(DS_SPARK_HOME, "target/lib/"))
-if len(DS_JAVA_JAR_PATH) == 1:
-    DS_JAVA_JAR_PATH = DS_JAVA_JAR_PATH[0]
+DS_JAVA_LIB_PATH = glob.glob(os.path.join(DS_SPARK_HOME, "target/lib/"))
+if len(DS_JAVA_LIB_PATH) == 1:
+    DS_JAVA_LIB_PATH = DS_JAVA_LIB_PATH[0]
 else: # error if something other than 1 directory found
     print(missing_jars_message, file=sys.stderr)
     sys.exit(-1)
@@ -73,27 +66,33 @@ except OSError:
     pass
 
 # Copy the relevant jar files to temp path
-for jar_file in glob.glob(os.path.join(DS_SPARK_JAR_PATH, f"datasketches-spark_*-{VERSION}.jar")):
+for path in DS_SPARK_JAR_PATH:
+    #for jar_file in glob.glob(os.path.join(path, f"datasketches-spark_*-{VERSION}.jar")):
+    for jar_file in glob.glob(os.path.join(path, f"datasketches-spark_*.jar")):
+        copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
+
+# copy any ds-java and ds-memory jars, and dependencies.txt, too
+for jar_file in glob.glob(os.path.join(DS_JAVA_LIB_PATH, f"datasketches-java-*.jar")):
     copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
-for jar_file in glob.glob(os.path.join(DS_JAVA_JAR_PATH, f"datasketches-java-*.jar")):
+for jar_file in glob.glob(os.path.join(DS_JAVA_LIB_PATH, f"datasketches-memory-*.jar")):
     copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
-for jar_file in glob.glob(os.path.join(DS_JAVA_JAR_PATH, f"datasketches-memory-*.jar")):
+for jar_file in glob.glob(os.path.join(DS_JAVA_LIB_PATH, f"dependencies.txt")):
     copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
 
 setup(
-    name='datasketches_spark',
-    version=VERSION,
-    author='Apache Software Foundation',
-    author_email='[email protected]',
-    description='The Apache DataSketches Library for Python',
-    license='Apache License 2.0',
-    url='http://datasketches.apache.org',
-    long_description=open('README.md').read(),
-    long_description_content_type='text/markdown',
-    include_package_data=True,
-    package_dir={'':'src'},
-    packages=find_packages(where='src'),
-    install_requires=['pyspark'],
-    python_requires='>=3.8',
-    zip_safe=False
+    #version = VERSION
+    # name='datasketches_spark',
+    # author='Apache Software Foundation',
+    # author_email='[email protected]',
+    # description='The Apache DataSketches Library for Python',
+    # license='Apache License 2.0',
+    # url='http://datasketches.apache.org',
+    # long_description=open('README.md').read(),
+    # long_description_content_type='text/markdown',
+    # include_package_data=True,
+    # package_dir={'':'src'},
+    # packages=find_packages(where='src'),
+    # install_requires=['pyspark'],
+    # python_requires='>=3.8',
+    # zip_safe=False
 )
diff --git a/python/src/datasketches_spark/__init__.py b/python/src/datasketches_spark/__init__.py
index c51bb17..7ac5919 100644
--- a/python/src/datasketches_spark/__init__.py
+++ b/python/src/datasketches_spark/__init__.py
@@ -1,3 +1,9 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
@@ -17,6 +23,7 @@ Provided under the Apache License, Version 2.0
 
 name = 'datasketches_spark'
 
+from ._version import __version__
 from .common import *
 #from .common import _invoke_function_over_columns
 from .kll import *
diff --git a/python/MANIFEST.in b/python/src/datasketches_spark/_version.py
similarity index 78%
copy from python/MANIFEST.in
copy to python/src/datasketches_spark/_version.py
index fb0c1bf..424e4d1 100644
--- a/python/MANIFEST.in
+++ b/python/src/datasketches_spark/_version.py
@@ -15,10 +15,13 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import importlib.resources
 
-# build/config files
-include ../version.cfg.in
+def get_version():
+  try:
+    with importlib.resources.open_text('datasketches_spark', 'version.txt') as f:
+      return f.read().strip()
+  except:
+    return '0.0.0'
 
-# content files
-graft src
-graft test
+__version__ = get_version()
diff --git a/python/src/datasketches_spark/common.py b/python/src/datasketches_spark/common.py
index 93be8f2..b812b40 100644
--- a/python/src/datasketches_spark/common.py
+++ b/python/src/datasketches_spark/common.py
@@ -1,3 +1,9 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
diff --git a/python/src/datasketches_spark/kll.py b/python/src/datasketches_spark/kll.py
index 64a99e5..f89ef16 100644
--- a/python/src/datasketches_spark/kll.py
+++ b/python/src/datasketches_spark/kll.py
@@ -1,3 +1,9 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
diff --git a/python/MANIFEST.in b/version.cfg
similarity index 90%
copy from python/MANIFEST.in
copy to version.cfg
index fb0c1bf..2a56927 100644
--- a/python/MANIFEST.in
+++ b/version.cfg
@@ -15,10 +15,4 @@
 # specific language governing permissions and limitations
 # under the License.
 
-
-# build/config files
-include ../version.cfg.in
-
-# content files
-graft src
-graft test
+0.1.0-SNAPSHOT
diff --git a/version.cfg.in b/version.cfg.in
deleted file mode 100644
index 1821d33..0000000
--- a/version.cfg.in
+++ /dev/null
@@ -1 +0,0 @@
-0.1.0.dev0
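
(A minimal end-to-end sketch, not part of the commit, of how the pieces above fit together. "sbt clean package" is quoted from setup.py's error message; the use of the standard "build" frontend and the installed-package check are assumptions based on the new pyproject.toml and _version.py.)

    # From the repository root (stages jars under target/ and writes python/version.txt):
    #   sbt clean package
    # Then, from the python/ directory (setuptools backend per pyproject.toml):
    #   python -m build
    # After installing the resulting wheel:
    import datasketches_spark
    print(datasketches_spark.__version__)
    # prints the packaged version (e.g. "0.1.0.dev0" for a 0.1.0-SNAPSHOT build),
    # or the "0.0.0" fallback from _version.py if version.txt is not bundled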


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
