This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch python
in repository https://gitbox.apache.org/repos/asf/datasketches-spark.git
The following commit(s) were added to refs/heads/python by this push:
new d63c5aa cleaner build.sbt that better interacts with setting up python. many changes to make python build work for wheel or sdist
d63c5aa is described below
commit d63c5aa1a7d6d4fa010aeaa8cf11a2dc586a707d
Author: Jon Malkin <[email protected]>
AuthorDate: Tue Feb 11 20:42:45 2025 -0800
cleaner build.sbt that better interacts with setting up python. many changes to make python build work for wheel or sdist
---
build.sbt | 40 ++++--------
project/BuildUtils.scala | 75 +++++++++++++++++++---
python/MANIFEST.in | 8 +--
python/pyproject.toml | 53 +++++++++++++++
python/setup.py | 73 +++++++++++----------
python/src/datasketches_spark/__init__.py | 7 ++
.../datasketches_spark/_version.py} | 13 ++--
python/src/datasketches_spark/common.py | 6 ++
python/src/datasketches_spark/kll.py | 6 ++
python/MANIFEST.in => version.cfg | 8 +--
version.cfg.in | 1 -
11 files changed, 198 insertions(+), 92 deletions(-)
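The workflow this commit targets is: build the jar with sbt (which now also copies the datasketches-java/-memory dependencies into target/lib and writes python/version.txt), then build the Python sdist or wheel from the python/ directory. A minimal sketch of that flow, driven from Python for illustration only; the use of 'python -m build' is an assumption, not something this commit prescribes:

    import subprocess

    # Build the Scala jar; this also copies the datasketches-java/-memory
    # dependencies into target/lib and writes python/version.txt.
    subprocess.run(["sbt", "clean", "package"], check=True)

    # Build the sdist and wheel from the python/ subdirectory (assumed
    # tooling: the PyPA 'build' package).
    subprocess.run(["python", "-m", "build"], cwd="python", check=True)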
diff --git a/build.sbt b/build.sbt
index 38e2a86..bff8af7 100644
--- a/build.sbt
+++ b/build.sbt
@@ -16,24 +16,12 @@
*/
import scala.xml.dtd.DEFAULT
-import scala.io.Source
import BuildUtils._
val DEFAULT_SCALA_VERSION = "2.12.20"
val DEFAULT_SPARK_VERSION = "3.5.4"
-val DEFAULT_JDK_VERSION = "17"
-
-// Map of JVM version prefix to:
-// (JVM major version, datasketches-java version)
-// TODO: consider moving to external file
-val jvmVersionMap = Map(
- "21" -> ("21", "8.0.0"),
- "17" -> ("17", "7.0.1"),
- "11" -> ("11", "6.2.0"),
- "8" -> ("8", "6.2.0"),
- "1.8" -> ("8", "6.2.0")
-)
+val DEFAULT_JDK_VERSION = "11"
// version processing logic
val scalaVersion = settingKey[String]("The version of Scala")
@@ -56,10 +44,12 @@ val dsJavaVersionValue =
jvmVersionMap.get(jvmVersionValue).map(_._2).getOrElse(
lazy val copyDatasketchesDependencies = taskKey[Seq[File]]("Copy dependencies to a known location")
+lazy val cleanPythonVersionFile = taskKey[Unit]("Clean the python version file")
+
lazy val root = (project in file("."))
.settings(
name := "datasketches-spark",
- version := readVersion("version.cfg.in"),
+ version := readVersionAndCopyToPython("version.cfg"),
organization := "org.apache.datasketches",
description := "The Apache DataSketches package for Spark",
licenses += ("Apache-2.0", url("http://www.apache.org/licenses/LICENSE-2.0")),
@@ -90,18 +80,13 @@ lazy val root = (project in file("."))
// we want to copy non-provided/non-test dependencies to a known location
// so that they can be obtained easily
val targetLibDir = target.value / "lib"
- IO.createDirectory(targetLibDir)
- val dependencyJars = (Compile / dependencyClasspath).value.collect {
- case attr if (attr.data.getName.startsWith("datasketches-java") || attr.data.getName.startsWith("datasketches-memory"))
- && attr.data.getName.endsWith(".jar") =>
- val file = attr.data
- val targetFile = targetLibDir / file.getName
- IO.copyFile(file, targetFile)
- targetFile
- }
- dependencyJars
+ val listFile = targetLibDir / "dependencies.txt"
+ val dependencies = (Compile / dependencyClasspath).value
+ BuildUtils.copyDependenciesAndWriteList(targetLibDir, dependencies, listFile)
},
- Compile / packageBin := (Compile / packageBin).dependsOn(copyDatasketchesDependencies).value,
+ Compile / packageBin := (Compile / packageBin)
+ .dependsOn(copyDatasketchesDependencies)
+ .andFinally(readVersionAndCopyToPython("version.cfg")).value,
Test / testOptions += Tests.Argument(TestFrameworks.ScalaTest, "-oD"),
// additional options for java 17
Test / fork := {
@@ -118,6 +103,7 @@ lazy val root = (project in file("."))
}
},
Test / logBuffered := false,
- // Level.INFO is needed to see detailed output when running tests
- Test / logLevel := Level.Info
+ Test / logLevel := Level.Info,
+ cleanPythonVersionFile := BuildUtils.cleanPythonVersionFile(),
+ clean := clean.dependsOn(cleanPythonVersionFile).value
)
diff --git a/project/BuildUtils.scala b/project/BuildUtils.scala
index 40e22e6..e0dd64a 100644
--- a/project/BuildUtils.scala
+++ b/project/BuildUtils.scala
@@ -15,26 +15,81 @@
* limitations under the License.
*/
+import sbt._
+import java.io.{BufferedWriter, FileWriter}
import scala.io.Source
object BuildUtils {
-def generateVersionCfg(filename: String): Unit = {
- val version = readVersion(filename + ".in")
- val writer = new java.io.PrintWriter(filename) {
- write(version)
- close()
+// Map of JVM version prefix to:
+// (JVM major version, datasketches-java version)
+// TODO: consider moving to external file
+val jvmVersionMap = Map(
+ "21" -> ("21", "8.0.0"),
+ "17" -> ("17", "7.0.1"),
+ "11" -> ("11", "6.2.0"),
+ "8" -> ("8", "6.2.0"),
+ "1.8" -> ("8", "6.2.0")
+)
+
+// TODO: any way to avoid hardcoding this?
+//val pythonVersionFileName = "python/src/datasketches_spark/version.py"
+val pythonVersionFileName = "python/version.txt"
+
+// reads the version file, reformats as needed for python, and stores
+// in the python subdirectory as the __version__ function for the package
+def readVersionAndCopyToPython(filename: String): String = {
+ val bufferedSource = Source.fromFile(filename)
+ val version = try {
+ bufferedSource.getLines.find(line => !line.trim.startsWith("#") && !line.trim.isBlank()).get
+ } finally {
+ bufferedSource.close()
+ }
+
+ // write version to python subdirectory
+ val pyVersion = version.replace("-SNAPSHOT", ".dev0")
+ val writer = new BufferedWriter(new FileWriter(pythonVersionFileName))
+ try {
+ writer.write(pyVersion)
+ writer.newLine()
+ } finally {
+ writer.close()
}
+ version
}
-def readVersion(filename: String): String = {
- // TODO: only generate version.cfg if version.cfg.in is newer
- val bufferedSource = Source.fromFile(filename)
+// removes the python version file from the python subdir
+def cleanPythonVersionFile(): Unit = {
+ val pyFile = new File(pythonVersionFileName)
+ if (pyFile.exists()) {
+ pyFile.delete()
+ }
+}
+
+// copies the datasketches dependencies to a known location in /target
+def copyDependenciesAndWriteList(targetLibDir: File, dependencies: Seq[Attributed[File]], listFile: File): Seq[File] = {
+ IO.createDirectory(targetLibDir)
+ val dependencyJars = dependencies.collect {
+ case attr if (attr.data.getName.startsWith("datasketches-java") || attr.data.getName.startsWith("datasketches-memory"))
+ && attr.data.getName.endsWith(".jar") =>
+ val file = attr.data
+ val targetFile = targetLibDir / file.getName
+ IO.copyFile(file, targetFile)
+ targetFile
+ }
+
+ // write list of copied jars to file so we have full names/versions
+ val writer = new BufferedWriter(new FileWriter(listFile))
try {
- bufferedSource.getLines.find(line => !line.trim.startsWith("#")).get
+ dependencyJars.foreach { file =>
+ writer.write(file.getName)
+ writer.newLine()
+ }
} finally {
- bufferedSource.close()
+ writer.close()
}
+
+ dependencyJars
}
}
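For reference, readVersionAndCopyToPython maps the Maven-style "-SNAPSHOT" suffix to the PEP 440 ".dev0" suffix before writing python/version.txt. A minimal Python sketch of the same transformation, assuming the file locations used above (version.cfg at the repository root, python/version.txt as the output):

    from pathlib import Path

    def snapshot_to_pep440(version: str) -> str:
        # "-SNAPSHOT" means nothing to Python packaging tools; ".dev0" is the
        # closest PEP 440 equivalent for an unreleased development version.
        return version.replace("-SNAPSHOT", ".dev0")

    # Mirror of readVersionAndCopyToPython: take the first non-comment,
    # non-blank line of version.cfg and write the converted value for Python.
    raw = next(
        line.strip()
        for line in Path("version.cfg").read_text().splitlines()
        if line.strip() and not line.strip().startswith("#")
    )
    Path("python/version.txt").write_text(snapshot_to_pep440(raw) + "\n")
    # e.g. "0.1.0-SNAPSHOT" -> "0.1.0.dev0"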
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
index fb0c1bf..1ef57bb 100644
--- a/python/MANIFEST.in
+++ b/python/MANIFEST.in
@@ -15,10 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+include version.txt
-# build/config files
-include ../version.cfg.in
-
-# content files
graft src
-graft test
+graft tests
+graft src/datasketches_spark/deps/*
\ No newline at end of file
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100644
index 0000000..648773d
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,53 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "datasketches_spark"
+#version = { file = "version.txt" }
+dynamic = ["version"]
+description = "The Apache DataSketches Library for Python"
+authors = [
+ { name = "Apache Software Foundation", email = "[email protected]" }
+]
+license = { text = "Apache License 2.0" }
+readme = "README.md"
+requires-python = ">=3.8"
+dependencies = [
+ "pyspark"
+]
+
+[tool.setuptools]
+package-dir = { "" = "src" }
+
+[tool.setuptools.dynamic]
+version = { file = "version.txt" }
+
+[tool.setuptools.package-data]
+datasketches_spark = ["deps/*"]
+
+[tool.setuptools.find]
+where = ["src"]
+include = ["datasketches_spark"]
+exclude = ["datasketches_spark.deps", "datasketches_spark.tests"]
+
+[tool.cibuildwheel]
+build-verbosity = 0 # options: 1, 2, or 3
+skip = ["cp36-*", "cp37-*", "cp38-*", "pp*", "*-win32"]
\ No newline at end of file
diff --git a/python/setup.py b/python/setup.py
index ddb17b1..d556086 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -15,18 +15,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-#import importlib.util
import glob
import os
import sys
-#import ctypes
from setuptools import setup, find_packages
#from setuptools.command.install import install
from shutil import copyfile #, copytree, rmtree
DS_SPARK_HOME = os.environ.get("DS_SPARK_HOME", os.path.abspath("../"))
-with open(f'{DS_SPARK_HOME}/version.cfg.in', 'r') as file:
- VERSION = file.read().rstrip()
+#with open(f'{DS_SPARK_HOME}/version.cfg.in', 'r') as file:
+# VERSION = file.read().rstrip()
TEMP_PATH = "src/datasketches_spark/deps" # we can store the relevant jars in here
# An error message if trying to run this without first building the jars
@@ -37,29 +35,24 @@ from source, you need to first build the jars.
To build the jars, run the following command from the root directory of
the repository:
sbt clean package
-Next, you can return to this diretory and resume.
+
+If building for pyspark, you should build the jar with any version of
+Scala you may expect to use. The Scala version can be set via the
+SCALA_VERSION environment variable.
+
+Then return to this directory and resume building your sdist or wheel.
"""
# Find the datasketches-spark jar path -- other dependencies handled separately
DS_SPARK_JAR_PATH = glob.glob(os.path.join(DS_SPARK_HOME, "target/scala-*/"))
-if len(DS_SPARK_JAR_PATH) == 1:
- DS_SPARK_JAR_PATH = DS_SPARK_JAR_PATH[0]
-elif len(DS_SPARK_JAR_PATH) > 1:
- print(
- "Found jars for multiple scala versions ({0}). Please clean up the
target directory".format(
- DS_SPARK_JAR_PATH
- ),
- file=sys.stderr
- )
- sys.exit(-1)
-elif len(DS_SPARK_JAR_PATH) == 0: # core spark also checks for TEMP_PATH -- unclear why?
+if len(DS_SPARK_JAR_PATH) == 0:
print(missing_jars_message, file=sys.stderr)
sys.exit(-1)
# Find the datasketches-java and datasketches-memory dependency jar path
-DS_JAVA_JAR_PATH = glob.glob(os.path.join(DS_SPARK_HOME, "target/lib/"))
-if len(DS_JAVA_JAR_PATH) == 1:
- DS_JAVA_JAR_PATH = DS_JAVA_JAR_PATH[0]
+DS_JAVA_LIB_PATH = glob.glob(os.path.join(DS_SPARK_HOME, "target/lib/"))
+if len(DS_JAVA_LIB_PATH) == 1:
+ DS_JAVA_LIB_PATH = DS_JAVA_LIB_PATH[0]
else: # error if something other than 1 directory found
print(missing_jars_message, file=sys.stderr)
sys.exit(-1)
@@ -73,27 +66,33 @@ except OSError:
pass
# Copy the relevant jar files to temp path
-for jar_file in glob.glob(os.path.join(DS_SPARK_JAR_PATH, f"datasketches-spark_*-{VERSION}.jar")):
+for path in DS_SPARK_JAR_PATH:
+ #for jar_file in glob.glob(os.path.join(path, f"datasketches-spark_*-{VERSION}.jar")):
+ for jar_file in glob.glob(os.path.join(path, f"datasketches-spark_*.jar")):
+ copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
+
+# copy any ds-java and ds-memory jars, and dependencies.txt, too
+for jar_file in glob.glob(os.path.join(DS_JAVA_LIB_PATH, f"datasketches-java-*.jar")):
copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
-for jar_file in glob.glob(os.path.join(DS_JAVA_JAR_PATH, f"datasketches-java-*.jar")):
+for jar_file in glob.glob(os.path.join(DS_JAVA_LIB_PATH, f"datasketches-memory-*.jar")):
copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
-for jar_file in glob.glob(os.path.join(DS_JAVA_JAR_PATH, f"datasketches-memory-*.jar")):
+for jar_file in glob.glob(os.path.join(DS_JAVA_LIB_PATH, f"dependencies.txt")):
copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
setup(
- name='datasketches_spark',
- version=VERSION,
- author='Apache Software Foundation',
- author_email='[email protected]',
- description='The Apache DataSketches Library for Python',
- license='Apache License 2.0',
- url='http://datasketches.apache.org',
- long_description=open('README.md').read(),
- long_description_content_type='text/markdown',
- include_package_data=True,
- package_dir={'':'src'},
- packages=find_packages(where='src'),
- install_requires=['pyspark'],
- python_requires='>=3.8',
- zip_safe=False
+ #version = VERSION
+ # name='datasketches_spark',
+ # author='Apache Software Foundation',
+ # author_email='[email protected]',
+ # description='The Apache DataSketches Library for Python',
+ # license='Apache License 2.0',
+ # url='http://datasketches.apache.org',
+ # long_description=open('README.md').read(),
+ # long_description_content_type='text/markdown',
+ # include_package_data=True,
+ # package_dir={'':'src'},
+ # packages=find_packages(where='src'),
+ # install_requires=['pyspark'],
+ # python_requires='>=3.8',
+ # zip_safe=False
)
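With the jars copied into src/datasketches_spark/deps, an installed wheel carries them as package data. The snippet below is only an illustration of how they could be handed to Spark, not part of this commit; it assumes the jars land in a deps/ directory next to the installed module and uses the standard spark.jars setting:

    import glob
    import os

    from pyspark.sql import SparkSession

    import datasketches_spark

    # Locate the jars shipped inside the installed package (assumed layout).
    deps_dir = os.path.join(os.path.dirname(datasketches_spark.__file__), "deps")
    jars = ",".join(glob.glob(os.path.join(deps_dir, "*.jar")))

    spark = (
        SparkSession.builder
        .appName("datasketches-spark-demo")
        .config("spark.jars", jars)  # comma-separated list of local jar paths
        .getOrCreate()
    )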
diff --git a/python/src/datasketches_spark/__init__.py b/python/src/datasketches_spark/__init__.py
index c51bb17..7ac5919 100644
--- a/python/src/datasketches_spark/__init__.py
+++ b/python/src/datasketches_spark/__init__.py
@@ -1,3 +1,9 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
@@ -17,6 +23,7 @@ Provided under the Apache License, Version 2.0
name = 'datasketches_spark'
+from ._version import __version__
from .common import *
#from .common import _invoke_function_over_columns
from .kll import *
diff --git a/python/MANIFEST.in b/python/src/datasketches_spark/_version.py
similarity index 78%
copy from python/MANIFEST.in
copy to python/src/datasketches_spark/_version.py
index fb0c1bf..424e4d1 100644
--- a/python/MANIFEST.in
+++ b/python/src/datasketches_spark/_version.py
@@ -15,10 +15,13 @@
# specific language governing permissions and limitations
# under the License.
+import importlib.resources
-# build/config files
-include ../version.cfg.in
+def get_version():
+ try:
+ with importlib.resources.open_text('datasketches_spark', 'version.txt') as f:
+ return f.read().strip()
+ except:
+ return '0.0.0'
-# content files
-graft src
-graft test
+__version__ = get_version()
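A quick sanity check once a wheel or sdist built from this layout is installed; note that get_version() falls back to '0.0.0' if version.txt is not packaged alongside the module:

    import datasketches_spark

    # Version resolved at import time from the bundled version.txt.
    print(datasketches_spark.__version__)  # e.g. "0.1.0.dev0"

    # The installed distribution metadata should carry the same value, since
    # pyproject.toml reads its dynamic version from version.txt.
    from importlib.metadata import version
    print(version("datasketches_spark"))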
diff --git a/python/src/datasketches_spark/common.py b/python/src/datasketches_spark/common.py
index 93be8f2..b812b40 100644
--- a/python/src/datasketches_spark/common.py
+++ b/python/src/datasketches_spark/common.py
@@ -1,3 +1,9 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
diff --git a/python/src/datasketches_spark/kll.py b/python/src/datasketches_spark/kll.py
index 64a99e5..f89ef16 100644
--- a/python/src/datasketches_spark/kll.py
+++ b/python/src/datasketches_spark/kll.py
@@ -1,3 +1,9 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
diff --git a/python/MANIFEST.in b/version.cfg
similarity index 90%
copy from python/MANIFEST.in
copy to version.cfg
index fb0c1bf..2a56927 100644
--- a/python/MANIFEST.in
+++ b/version.cfg
@@ -15,10 +15,4 @@
# specific language governing permissions and limitations
# under the License.
-
-# build/config files
-include ../version.cfg.in
-
-# content files
-graft src
-graft test
+0.1.0-SNAPSHOT
diff --git a/version.cfg.in b/version.cfg.in
deleted file mode 100644
index 1821d33..0000000
--- a/version.cfg.in
+++ /dev/null
@@ -1 +0,0 @@
-0.1.0.dev0
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]