This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch python
in repository https://gitbox.apache.org/repos/asf/datasketches-spark.git
The following commit(s) were added to refs/heads/python by this push:
new 4e8d151 WIP: modified build.sbt to copy ds-java jars, added setup.py to include in sdist/wheels.
4e8d151 is described below
commit 4e8d1514f37031704c9b0fb473aa916cde43df9e
Author: Jon Malkin <[email protected]>
AuthorDate: Wed Feb 5 19:05:26 2025 -0800
WIP: modified build.sbt to copy ds-java jars, added setup.py to include in sdist/wheels.
---
.gitignore | 24 +---
build.sbt | 153 ++++++++++++++----------
project/BuildUtils.scala | 40 +++++++
python/MANIFEST.in | 24 ++++
python/README.md | 24 ++++
python/setup.py | 99 +++++++++++++++
python/{ => src}/datasketches_spark/__init__.py | 2 +-
python/{ => src}/datasketches_spark/common.py | 23 ++++
python/{ => src}/datasketches_spark/kll.py | 0
version.cfg.in | 1 +
10 files changed, 301 insertions(+), 89 deletions(-)
diff --git a/.gitignore b/.gitignore
index fe02d20..38318e3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,8 +14,9 @@
*.iws
# Scala project files and related plugins
+project/target
+project/build.properties
target/
-project/
.bsp/
.bloop/
.metals/
@@ -48,26 +49,5 @@ bin/
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
-#Test config and output
-test-output/
-local/
-reports/
-.pmd
-tmp/
-
-# Build artifacts
-target/
-serialization_test_data/
-out/
-build/
-jarsIn/
-build.xml
-*.properties
-*.releaseBackup
-*.next
-*.tag
-doc/
-
-
# Sketch binary test files
*.sk
\ No newline at end of file
diff --git a/build.sbt b/build.sbt
index 528bbad..38e2a86 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,4 +1,3 @@
-import scala.xml.dtd.DEFAULT
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,29 +15,18 @@ import scala.xml.dtd.DEFAULT
* limitations under the License.
*/
-name := "datasketches-spark"
-version := "1.0-SNAPSHOT"
+import scala.xml.dtd.DEFAULT
+import scala.io.Source
+
+import BuildUtils._
val DEFAULT_SCALA_VERSION = "2.12.20"
val DEFAULT_SPARK_VERSION = "3.5.4"
-val DEFAULT_JDK_VERSION = "11"
-
-organization := "org.apache.datasketches"
-description := "The Apache DataSketches package for Spark"
-
-licenses += ("Apache-2.0", url("http://www.apache.org/licenses/LICENSE-2.0"))
-
-scalaVersion := sys.env.getOrElse("SCALA_VERSION", DEFAULT_SCALA_VERSION)
-
-val sparkVersion = settingKey[String]("The version of Spark")
-sparkVersion := sys.env.getOrElse("SPARK_VERSION", DEFAULT_SPARK_VERSION)
-
-// determine our java version
-val jvmVersionString = settingKey[String]("The JVM version")
-jvmVersionString := sys.props("java.version")
+val DEFAULT_JDK_VERSION = "17"
// Map of JVM version prefix to:
// (JVM major version, datasketches-java version)
+// TODO: consider moving to external file
val jvmVersionMap = Map(
"21" -> ("21", "8.0.0"),
"17" -> ("17", "7.0.1"),
@@ -47,56 +35,89 @@ val jvmVersionMap = Map(
"1.8" -> ("8", "6.2.0")
)
-// determine the JVM major version (default: 11)
+// version processing logic
+val scalaVersion = settingKey[String]("The version of Scala")
+val scalaVersionValue = sys.env.getOrElse("SCALA_VERSION", DEFAULT_SCALA_VERSION)
+
+val sparkVersion = settingKey[String]("The version of Spark")
+val sparkVersionValue = sys.env.getOrElse("SPARK_VERSION", DEFAULT_SPARK_VERSION)
+
+val jvmFullVersion = settingKey[String]("The JVM version")
+val jvmFullVersionValue = sys.props("java.version")
+
val jvmVersion = settingKey[String]("The JVM major version")
-jvmVersion := jvmVersionMap.collectFirst {
- case (prefix, (major, _)) if jvmVersionString.value.startsWith(prefix) => major
+val jvmVersionValue = jvmVersionMap.collectFirst {
+ case (prefix, (major, _)) if jvmFullVersionValue.startsWith(prefix) => major
}.getOrElse(DEFAULT_JDK_VERSION)
// look up the associated datasketches-java version
val dsJavaVersion = settingKey[String]("The DataSketches Java version")
-dsJavaVersion := jvmVersionMap.get(jvmVersion.value).map(_._2).get
-
-
-javacOptions ++= Seq("-source", jvmVersion.value, "-target", jvmVersion.value)
-scalacOptions ++= Seq("-encoding", "UTF-8", "-release", jvmVersion.value)
-Test / javacOptions ++= Seq("-source", jvmVersion.value, "-target", jvmVersion.value)
-Test / scalacOptions ++= Seq("-encoding", "UTF-8", "-release", jvmVersion.value)
-
-libraryDependencies ++= Seq(
- "org.apache.datasketches" % "datasketches-java" % dsJavaVersion.value %
"compile",
- "org.scala-lang" % "scala-library" % scalaVersion.value, // scala3-library
may need to use %%
- ("org.apache.spark" %% "spark-sql" % sparkVersion.value %
"provided").cross(CrossVersion.for3Use2_13),
- "org.scalatest" %% "scalatest" % "3.2.19" % "test",
- "org.scalatestplus" %% "junit-4-13" % "3.2.19.0" % "test"
-)
-
-Test / testOptions += Tests.Argument(TestFrameworks.ScalaTest, "-oD")
-
-// additional options for java 17
-Test / fork := {
- if (jvmVersion.value == "17") true
- else (Test / fork).value
-}
-
-Test / javaOptions ++= {
- if (jvmVersion.value == "17") {
- Seq("--add-modules=jdk.incubator.foreign",
- "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED"
- )
- } else {
- Seq.empty
- }
-}
-
-scalacOptions ++= Seq(
- "-deprecation",
- "-feature",
- "-unchecked",
- "-Xlint"
-)
-
-Test / logBuffered := false
-
-// Level.INFO is needed to see detailed output when running tests
-Test / logLevel := Level.Info
+val dsJavaVersionValue = jvmVersionMap.get(jvmVersionValue).map(_._2).getOrElse("ERROR")
+
+lazy val copyDatasketchesDependencies = taskKey[Seq[File]]("Copy dependencies to a known location")
+
+lazy val root = (project in file("."))
+ .settings(
+ name := "datasketches-spark",
+ version := readVersion("version.cfg.in"),
+ organization := "org.apache.datasketches",
+ description := "The Apache DataSketches package for Spark",
+ licenses += ("Apache-2.0",
url("http://www.apache.org/licenses/LICENSE-2.0")),
+ homepage := Some(url("https://datasketches.apache.org/")),
+ jvmVersion := jvmVersionValue,
+ dsJavaVersion := dsJavaVersionValue,
+ sparkVersion := sparkVersionValue,
+ scalaVersion := scalaVersionValue,
+ javacOptions ++= Seq("-source", jvmVersion.value, "-target", jvmVersion.value),
+ scalacOptions ++= Seq("-encoding", "UTF-8", "-release", jvmVersion.value),
+ Test / javacOptions ++= Seq("-source", jvmVersion.value, "-target", jvmVersion.value),
+ Test / scalacOptions ++= Seq(
+ "-encoding", "UTF-8",
+ "-release", jvmVersion.value,
+ "-deprecation",
+ "-feature",
+ "-unchecked",
+ "-Xlint"
+ ),
+ libraryDependencies ++= Seq(
+ "org.apache.datasketches" % "datasketches-java" % dsJavaVersion.value,
+ "org.scala-lang" % "scala-library" % scalaVersion.value % "provided", //
scala3-library may need to use %%
+ ("org.apache.spark" %% "spark-sql" % sparkVersion.value %
"provided").cross(CrossVersion.for3Use2_13),
+ "org.scalatest" %% "scalatest" % "3.2.19" % "test",
+ "org.scalatestplus" %% "junit-4-13" % "3.2.19.0" % "test"
+ ),
+ copyDatasketchesDependencies := {
+ // we want to copy non-provided/non-test dependencies to a known location
+ // so that they can be obtained easily
+ val targetLibDir = target.value / "lib"
+ IO.createDirectory(targetLibDir)
+ val dependencyJars = (Compile / dependencyClasspath).value.collect {
+ case attr if (attr.data.getName.startsWith("datasketches-java") || attr.data.getName.startsWith("datasketches-memory")) && attr.data.getName.endsWith(".jar") =>
+ val file = attr.data
+ val targetFile = targetLibDir / file.getName
+ IO.copyFile(file, targetFile)
+ targetFile
+ }
+ dependencyJars
+ },
+ Compile / packageBin := (Compile / packageBin).dependsOn(copyDatasketchesDependencies).value,
+ Test / testOptions += Tests.Argument(TestFrameworks.ScalaTest, "-oD"),
+ // additional options for java 17
+ Test / fork := {
+ if (jvmVersion.value == "17") true
+ else (Test / fork).value
+ },
+ Test / javaOptions ++= {
+ if (jvmVersion.value == "17") {
+ Seq("--add-modules=jdk.incubator.foreign",
+ "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED"
+ )
+ } else {
+ Seq.empty
+ }
+ },
+ Test / logBuffered := false,
+ // Level.INFO is needed to see detailed output when running tests
+ Test / logLevel := Level.Info
+ )
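[Editor's note: for readers skimming the build changes, the version logic above keys the datasketches-java dependency off the running JVM. A minimal Python sketch of the same prefix lookup, reproducing only the map entries visible in this hunk (the "11" entry falls outside the hunk and is omitted):

    # Mirrors jvmVersionMap.collectFirst in build.sbt: the first key that
    # prefixes java.version selects the JVM major and ds-java versions.
    jvm_version_map = {
        "21": ("21", "8.0.0"),
        "17": ("17", "7.0.1"),
        "1.8": ("8", "6.2.0"),
    }

    def resolve_versions(java_version, default_jdk="17"):
        for prefix, (major, ds_java) in jvm_version_map.items():
            if java_version.startswith(prefix):
                return major, ds_java
        # Fallbacks mirror getOrElse(DEFAULT_JDK_VERSION) and getOrElse("ERROR").
        return default_jdk, "ERROR"

    print(resolve_versions("17.0.9"))  # -> ('17', '7.0.1')
]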
diff --git a/project/BuildUtils.scala b/project/BuildUtils.scala
new file mode 100644
index 0000000..40e22e6
--- /dev/null
+++ b/project/BuildUtils.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import scala.io.Source
+
+object BuildUtils {
+
+def generateVersionCfg(filename: String): Unit = {
+ val version = readVersion(filename + ".in")
+ val writer = new java.io.PrintWriter(filename) {
+ write(version)
+ close()
+ }
+}
+
+def readVersion(filename: String): String = {
+ // TODO: only generate version.cfg if version.cfg.in is newer
+ val bufferedSource = Source.fromFile(filename)
+ try {
+ bufferedSource.getLines.find(line => !line.trim.startsWith("#")).get
+ } finally {
+ bufferedSource.close()
+ }
+}
+
+}
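[Editor's note: readVersion returns the first non-comment line of the given file, so with the version.cfg.in added at the end of this diff it yields "0.1.0.dev0". A rough Python equivalent, for illustration only:

    def read_version(filename):
        # First non-comment line is the version string, e.g. "0.1.0.dev0";
        # raises StopIteration (where the Scala .get would throw) if missing.
        with open(filename) as f:
            return next(line.rstrip("\n") for line in f
                        if not line.strip().startswith("#"))
]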
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
new file mode 100644
index 0000000..fb0c1bf
--- /dev/null
+++ b/python/MANIFEST.in
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# build/config files
+include ../version.cfg.in
+
+# content files
+graft src
+graft test
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..7e7f2b9
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,24 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Apache<sup>®</sup> DataSketches™ PySpark Library
+
+This repo is still an early-stage work in progress.
+
+This is the PySpark plugin component.
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 0000000..ddb17b1
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#import importlib.util
+import glob
+import os
+import sys
+#import ctypes
+from setuptools import setup, find_packages
+#from setuptools.command.install import install
+from shutil import copyfile #, copytree, rmtree
+
+DS_SPARK_HOME = os.environ.get("DS_SPARK_HOME", os.path.abspath("../"))
+with open(f'{DS_SPARK_HOME}/version.cfg.in', 'r') as file:
+ VERSION = file.read().rstrip()
+TEMP_PATH = "src/datasketches_spark/deps" # we can store the relevant jars in here
+
+# An error message if trying to run this without first building the jars
+missing_jars_message = """
+If you are trying to install the datasketches_spark Python package
+from source, you need to first build the jars.
+
+To build the jars, run the following command from the root directory of
+the repository:
+ sbt clean package
+Next, you can return to this directory and resume.
+"""
+
+# Find the datasketches-spark jar path -- other dependencies handled separately
+DS_SPARK_JAR_PATH = glob.glob(os.path.join(DS_SPARK_HOME, "target/scala-*/"))
+if len(DS_SPARK_JAR_PATH) == 1:
+ DS_SPARK_JAR_PATH = DS_SPARK_JAR_PATH[0]
+elif len(DS_SPARK_JAR_PATH) > 1:
+ print(
+ "Found jars for multiple scala versions ({0}). Please clean up the
target directory".format(
+ DS_SPARK_JAR_PATH
+ ),
+ file=sys.stderr
+ )
+ sys.exit(-1)
+elif len(DS_SPARK_JAR_PATH) == 0: # core spark also checks for TEMP_PATH -- unclear why?
+ print(missing_jars_message, file=sys.stderr)
+ sys.exit(-1)
+
+# Find the datasketches-java and datasketches-memory dependency jar path
+DS_JAVA_JAR_PATH = glob.glob(os.path.join(DS_SPARK_HOME, "target/lib/"))
+if len(DS_JAVA_JAR_PATH) == 1:
+ DS_JAVA_JAR_PATH = DS_JAVA_JAR_PATH[0]
+else: # error if something other than 1 directory found
+ print(missing_jars_message, file=sys.stderr)
+ sys.exit(-1)
+
+# Copy the jars to the temporary directory
+# Future possible enhancement: symlink instead of copy
+try:
+ os.makedirs(TEMP_PATH)
+except OSError:
+ # we don't care if it already exists
+ pass
+
+# Copy the relevant jar files to temp path
+for jar_file in glob.glob(os.path.join(DS_SPARK_JAR_PATH, f"datasketches-spark_*-{VERSION}.jar")):
+ copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
+for jar_file in glob.glob(os.path.join(DS_JAVA_JAR_PATH, f"datasketches-java-*.jar")):
+ copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
+for jar_file in glob.glob(os.path.join(DS_JAVA_JAR_PATH, f"datasketches-memory-*.jar")):
+ copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
+
+setup(
+ name='datasketches_spark',
+ version=VERSION,
+ author='Apache Software Foundation',
+ author_email='[email protected]',
+ description='The Apache DataSketches Library for Python',
+ license='Apache License 2.0',
+ url='http://datasketches.apache.org',
+ long_description=open('README.md').read(),
+ long_description_content_type='text/markdown',
+ include_package_data=True,
+ package_dir={'':'src'},
+ packages=find_packages(where='src'),
+ install_requires=['pyspark'],
+ python_requires='>=3.8',
+ zip_safe=False
+)
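[Editor's note: after running "sbt clean package" and this setup.py, the copied jars should sit under src/datasketches_spark/deps/ (TEMP_PATH above). A small, hypothetical sanity check:

    import glob
    import os

    # Expect one datasketches-spark_* jar plus the datasketches-java-*
    # and datasketches-memory-* copies made by the loops above.
    deps_dir = os.path.join("src", "datasketches_spark", "deps")
    for jar in sorted(glob.glob(os.path.join(deps_dir, "*.jar"))):
        print(jar)
]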
diff --git a/python/datasketches_spark/__init__.py b/python/src/datasketches_spark/__init__.py
similarity index 92%
rename from python/datasketches_spark/__init__.py
rename to python/src/datasketches_spark/__init__.py
index ef9effb..c51bb17 100644
--- a/python/datasketches_spark/__init__.py
+++ b/python/src/datasketches_spark/__init__.py
@@ -18,5 +18,5 @@ Provided under the Apache License, Version 2.0
name = 'datasketches_spark'
from .common import *
-from .common import _invoke_function_over_columns
+#from .common import _invoke_function_over_columns
from .kll import *
diff --git a/python/datasketches_spark/common.py b/python/src/datasketches_spark/common.py
similarity index 79%
rename from python/datasketches_spark/common.py
rename to python/src/datasketches_spark/common.py
index aba63c3..93be8f2 100644
--- a/python/datasketches_spark/common.py
+++ b/python/src/datasketches_spark/common.py
@@ -15,9 +15,32 @@ from py4j.java_gateway import JavaClass
from typing import Any, TypeVar, Union, Callable
from functools import lru_cache
+import os
+import pkg_resources
+
ColumnOrName = Union[Column, str]
ColumnOrName_ = TypeVar("ColumnOrName_", bound=ColumnOrName)
+def get_jar_paths(*jar_names: str) -> list[str]:
+ """
+ Returns a list of absolute paths to the provided jars,
+ assuming they are included in the package.
+ :param jar_names: Names of jars to retrieve
+ :return: List of absolute paths to jars
+ """
+ jar_paths = []
+ for jar_name in jar_names:
+ try:
+ jar_path = pkg_resources.resource_filename(__name__, f"deps/{jar_name}")
+ if os.path.exists(jar_path):
+ jar_paths.append(jar_path)
+ else:
+ raise FileNotFoundError(f"Jar {jar_name} not found in package")
+ except ValueError:
+ raise FileNotFoundError(f"Jar {jar_name} not found in package")
+ return jar_paths
+
+
# Since we have functions from different packages, rather than the
# single 16k+ line functions class in core Spark, we'll have each
# sketch family grab its own functions class from the JVM and cache it
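[Editor's note: a sketch of how the new get_jar_paths helper might be used to hand the bundled jars to Spark; the jar file names below are illustrative only, since the real names depend on the versions resolved at build time:

    from pyspark.sql import SparkSession
    from datasketches_spark.common import get_jar_paths

    # Illustrative names; actual jars depend on the build (see jvmVersionMap).
    jars = get_jar_paths("datasketches-java-7.0.1.jar",
                         "datasketches-memory-3.0.2.jar")
    spark = (SparkSession.builder
             .config("spark.jars", ",".join(jars))
             .getOrCreate())
]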
diff --git a/python/datasketches_spark/kll.py b/python/src/datasketches_spark/kll.py
similarity index 100%
rename from python/datasketches_spark/kll.py
rename to python/src/datasketches_spark/kll.py
diff --git a/version.cfg.in b/version.cfg.in
new file mode 100644
index 0000000..1821d33
--- /dev/null
+++ b/version.cfg.in
@@ -0,0 +1 @@
+0.1.0.dev0
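[Editor's note: putting the pieces together, the build flow this commit implies; the sbt command comes from the missing_jars_message in setup.py, while the sdist step is an assumption about how the package gets built:

    import subprocess

    # Build the Scala jar; copyDatasketchesDependencies also places the
    # datasketches-java/-memory jars in target/lib.
    subprocess.run(["sbt", "clean", "package"], check=True)
    # Build the Python sdist; setup.py copies the jars into the package.
    subprocess.run(["python", "setup.py", "sdist"], cwd="python", check=True)
]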
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]