This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch python
in repository https://gitbox.apache.org/repos/asf/datasketches-spark.git


The following commit(s) were added to refs/heads/python by this push:
     new 4e8d151  WIP: modified build.sbt to copy ds-java jars, added setup.py 
to include in sdist/wheels.
4e8d151 is described below

commit 4e8d1514f37031704c9b0fb473aa916cde43df9e
Author: Jon Malkin <[email protected]>
AuthorDate: Wed Feb 5 19:05:26 2025 -0800

    WIP: modified build.sbt to copy ds-java jars, added setup.py to include in 
sdist/wheels.
---
 .gitignore                                      |  24 +---
 build.sbt                                       | 153 ++++++++++++++----------
 project/BuildUtils.scala                        |  40 +++++++
 python/MANIFEST.in                              |  24 ++++
 python/README.md                                |  24 ++++
 python/setup.py                                 |  99 +++++++++++++++
 python/{ => src}/datasketches_spark/__init__.py |   2 +-
 python/{ => src}/datasketches_spark/common.py   |  23 ++++
 python/{ => src}/datasketches_spark/kll.py      |   0
 version.cfg.in                                  |   1 +
 10 files changed, 301 insertions(+), 89 deletions(-)

diff --git a/.gitignore b/.gitignore
index fe02d20..38318e3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,8 +14,9 @@
 *.iws
 
 # Scala project files and related plugins
+project/target
+project/build.properties
 target/
-project/
 .bsp/
 .bloop/
 .metals/
@@ -48,26 +49,5 @@ bin/
 # virtual machine crash logs, see 
http://www.java.com/en/download/help/error_hotspot.xml
 hs_err_pid*
 
-#Test config and output
-test-output/
-local/
-reports/
-.pmd
-tmp/
-
-# Build artifacts
-target/
-serialization_test_data/
-out/
-build/
-jarsIn/
-build.xml
-*.properties
-*.releaseBackup
-*.next
-*.tag
-doc/
-
-
 # Sketch binary test files
 *.sk
\ No newline at end of file
diff --git a/build.sbt b/build.sbt
index 528bbad..38e2a86 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,4 +1,3 @@
-import scala.xml.dtd.DEFAULT
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,29 +15,18 @@ import scala.xml.dtd.DEFAULT
  * limitations under the License.
  */
 
-name := "datasketches-spark"
-version := "1.0-SNAPSHOT"
+import scala.xml.dtd.DEFAULT
+import scala.io.Source
+
+import BuildUtils._
 
 val DEFAULT_SCALA_VERSION = "2.12.20"
 val DEFAULT_SPARK_VERSION = "3.5.4"
-val DEFAULT_JDK_VERSION = "11"
-
-organization := "org.apache.datasketches"
-description := "The Apache DataSketches package for Spark"
-
-licenses += ("Apache-2.0", url("http://www.apache.org/licenses/LICENSE-2.0"))
-
-scalaVersion := sys.env.getOrElse("SCALA_VERSION", DEFAULT_SCALA_VERSION)
-
-val sparkVersion = settingKey[String]("The version of Spark")
-sparkVersion := sys.env.getOrElse("SPARK_VERSION", DEFAULT_SPARK_VERSION)
-
-// determine our java version
-val jvmVersionString = settingKey[String]("The JVM version")
-jvmVersionString := sys.props("java.version")
+val DEFAULT_JDK_VERSION = "17"
 
 // Map of JVM version prefix to:
 // (JVM major version, datasketches-java version)
+// TODO: consider moving to external file
 val jvmVersionMap = Map(
   "21" -> ("21", "8.0.0"),
   "17" -> ("17", "7.0.1"),
@@ -47,56 +35,89 @@ val jvmVersionMap = Map(
   "1.8" -> ("8", "6.2.0")
 )
 
-// determine the JVM major verison (default: 11)
+// version processing logic
+val scalaVersion = settingKey[String]("The version of Scala")
+val scalaVersionValue = sys.env.getOrElse("SCALA_VERSION", 
DEFAULT_SCALA_VERSION)
+
+val sparkVersion = settingKey[String]("The version of Spark")
+val sparkVersionValue = sys.env.getOrElse("SPARK_VERSION", 
DEFAULT_SPARK_VERSION)
+
+val jvmFullVersion = settingKey[String]("The JVM version")
+val jvmFullVersionValue = sys.props("java.version")
+
 val jvmVersion = settingKey[String]("The JVM major version")
-jvmVersion := jvmVersionMap.collectFirst {
-  case (prefix, (major, _)) if jvmVersionString.value.startsWith(prefix) => 
major
+val jvmVersionValue = jvmVersionMap.collectFirst {
+  case (prefix, (major, _)) if jvmFullVersionValue.startsWith(prefix) => major
 }.getOrElse(DEFAULT_JDK_VERSION)
 
 // look up the associated datasketches-java version
 val dsJavaVersion = settingKey[String]("The DataSketches Java version")
-dsJavaVersion := jvmVersionMap.get(jvmVersion.value).map(_._2).get
-
-
-javacOptions ++= Seq("-source", jvmVersion.value, "-target", jvmVersion.value)
-scalacOptions ++= Seq("-encoding", "UTF-8", "-release", jvmVersion.value)
-Test / javacOptions ++= Seq("-source", jvmVersion.value, "-target", 
jvmVersion.value)
-Test / scalacOptions ++= Seq("-encoding", "UTF-8", "-release", 
jvmVersion.value)
-
-libraryDependencies ++= Seq(
-  "org.apache.datasketches" % "datasketches-java" % dsJavaVersion.value % 
"compile",
-  "org.scala-lang" % "scala-library" % scalaVersion.value, // scala3-library 
may need to use %%
-  ("org.apache.spark" %% "spark-sql" % sparkVersion.value % 
"provided").cross(CrossVersion.for3Use2_13),
-  "org.scalatest" %% "scalatest" % "3.2.19" % "test",
-  "org.scalatestplus" %% "junit-4-13" % "3.2.19.0" % "test"
-)
-
-Test / testOptions += Tests.Argument(TestFrameworks.ScalaTest, "-oD")
-
-// additional options for java 17
-Test / fork := {
-  if (jvmVersion.value == "17") true
-  else (Test / fork).value
-}
-
-Test / javaOptions ++= {
-  if (jvmVersion.value == "17") {
-    Seq("--add-modules=jdk.incubator.foreign",
-        "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED"
-    )
-  } else {
-    Seq.empty
-  }
-}
-
-scalacOptions ++= Seq(
-  "-deprecation",
-  "-feature",
-  "-unchecked",
-  "-Xlint"
-)
-
-Test / logBuffered := false
-
-// Level.INFO is needed to see detailed output when running tests
-Test / logLevel := Level.Info
+val dsJavaVersionValue = 
jvmVersionMap.get(jvmVersionValue).map(_._2).getOrElse("ERROR")
+
+lazy val copyDatasketchesDependencies = taskKey[Seq[File]]("Copy dependencies 
to a known location")
+
+lazy val root = (project in file("."))
+  .settings(
+    name := "datasketches-spark",
+    version := readVersion("version.cfg.in"),
+    organization := "org.apache.datasketches",
+    description := "The Apache DataSketches package for Spark",
+    licenses += ("Apache-2.0", 
url("http://www.apache.org/licenses/LICENSE-2.0")),
+    homepage := Some(url("https://datasketches.apache.org/")),
+    jvmVersion := jvmVersionValue,
+    dsJavaVersion := dsJavaVersionValue,
+    sparkVersion := sparkVersionValue,
+    scalaVersion := scalaVersionValue,
+    javacOptions ++= Seq("-source", jvmVersion.value, "-target", 
jvmVersion.value),
+    scalacOptions ++= Seq("-encoding", "UTF-8", "-release", jvmVersion.value),
+    Test / javacOptions ++= Seq("-source", jvmVersion.value, "-target", 
jvmVersion.value),
+    Test / scalacOptions ++= Seq(
+      "-encoding", "UTF-8",
+      "-release", jvmVersion.value,
+      "-deprecation",
+      "-feature",
+      "-unchecked",
+      "-Xlint"
+    ),
+    libraryDependencies ++= Seq(
+      "org.apache.datasketches" % "datasketches-java" % dsJavaVersion.value,
+      "org.scala-lang" % "scala-library" % scalaVersion.value % "provided", // 
scala3-library may need to use %%
+      ("org.apache.spark" %% "spark-sql" % sparkVersion.value % 
"provided").cross(CrossVersion.for3Use2_13),
+      "org.scalatest" %% "scalatest" % "3.2.19" % "test",
+      "org.scalatestplus" %% "junit-4-13" % "3.2.19.0" % "test"
+    ),
+    copyDatasketchesDependencies := {
+      // we want to copy non-provided/non-test dependencies to a known location
+      // so that they can be obtained easily
+      val targetLibDir = target.value / "lib"
+      IO.createDirectory(targetLibDir)
+      val dependencyJars = (Compile / dependencyClasspath).value.collect {
+        case attr if (attr.data.getName.startsWith("datasketches-java") || 
attr.data.getName.startsWith("datasketches-memory"))
+                      && attr.data.getName.endsWith(".jar") =>
+          val file = attr.data
+          val targetFile = targetLibDir / file.getName
+          IO.copyFile(file, targetFile)
+          targetFile
+      }
+      dependencyJars
+    },
+    Compile / packageBin := (Compile / 
packageBin).dependsOn(copyDatasketchesDependencies).value,
+    Test / testOptions += Tests.Argument(TestFrameworks.ScalaTest, "-oD"),
+    // additional options for java 17
+    Test / fork := {
+      if (jvmVersion.value == "17") true
+      else (Test / fork).value
+    },
+    Test / javaOptions ++= {
+      if (jvmVersion.value == "17") {
+        Seq("--add-modules=jdk.incubator.foreign",
+            "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED"
+        )
+      } else {
+        Seq.empty
+      }
+    },
+    Test / logBuffered := false,
+    // Level.INFO is needed to see detailed output when running tests
+    Test / logLevel := Level.Info
+  )
diff --git a/project/BuildUtils.scala b/project/BuildUtils.scala
new file mode 100644
index 0000000..40e22e6
--- /dev/null
+++ b/project/BuildUtils.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import scala.io.Source
+
+object BuildUtils {
+
+def generateVersionCfg(filename: String): Unit = {
+  val version = readVersion(filename + ".in")
+  val writer = new java.io.PrintWriter(filename) {
+    write(version)
+    close()
+  }
+}
+
+def readVersion(filename: String): String = {
+  // TODO: only generate version.cfg if version.cfg.in is newer
+  val bufferedSource = Source.fromFile(filename)
+  try {
+    bufferedSource.getLines.find(line => !line.trim.startsWith("#")).get
+  } finally {
+    bufferedSource.close()
+  }
+}
+
+}
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
new file mode 100644
index 0000000..fb0c1bf
--- /dev/null
+++ b/python/MANIFEST.in
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# build/config files
+include ../version.cfg.in
+
+# content files
+graft src
+graft test
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..7e7f2b9
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,24 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
+
+# Apache<sup>&reg;</sup> DataSketches&trade; PySpark Library
+
+This repo is still an early-stage work in progress.
+
+This is the PySpark plugin component.
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 0000000..ddb17b1
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#import importlib.util
+import glob
+import os
+import sys
+#import ctypes
+from setuptools import setup, find_packages
+#from setuptools.command.install import install
+from shutil import copyfile #, copytree, rmtree
+
+DS_SPARK_HOME = os.environ.get("DS_SPARK_HOME", os.path.abspath("../"))
+with open(f'{DS_SPARK_HOME}/version.cfg.in', 'r') as file:
+    VERSION = file.read().rstrip()
+TEMP_PATH = "src/datasketches_spark/deps" # we can store the relevant jars in 
here
+
+# An error message if trying to run this without first building the jars
+missing_jars_message = """
+If you are trying to install the datasketches_spark Python package
+from source, you need to first build the jars.
+
+To build the jars, run the following command from the root directory of
+the repository:
+    sbt clean package
+Next, you can return to this directory and resume.
+"""
+
+# Find the datasketches-spark jar path -- other dependencies handled separately
+DS_SPARK_JAR_PATH = glob.glob(os.path.join(DS_SPARK_HOME, "target/scala-*/"))
+if len(DS_SPARK_JAR_PATH) == 1:
+    DS_SPARK_JAR_PATH = DS_SPARK_JAR_PATH[0]
+elif len(DS_SPARK_JAR_PATH) > 1:
+    print(
+        "Found jars for multiple scala versions ({0}). Please clean up the 
target directory".format(
+            DS_SPARK_JAR_PATH
+        ),
+        file=sys.stderr
+    )
+    sys.exit(-1)
+elif len(DS_SPARK_JAR_PATH) == 0: # core spark also checks for TEMP_PATH -- 
unclear why?
+    print(missing_jars_message, file=sys.stderr)
+    sys.exit(-1)
+
+# Find the datasketches-java and datasketches-memory dependency jar path
+DS_JAVA_JAR_PATH = glob.glob(os.path.join(DS_SPARK_HOME, "target/lib/"))
+if len(DS_JAVA_JAR_PATH) == 1:
+    DS_JAVA_JAR_PATH = DS_JAVA_JAR_PATH[0]
+else: # error if something other than 1 directory found
+    print(missing_jars_message, file=sys.stderr)
+    sys.exit(-1)
+
+# Copy the jars to the temporary directory
+# Future possible enhancement: symlink instead of copy
+try:
+    os.makedirs(TEMP_PATH)
+except OSError:
+    # we don't care if it already exists
+    pass
+
+# Copy the relevant jar files to temp path
+for jar_file in glob.glob(os.path.join(DS_SPARK_JAR_PATH, 
f"datasketches-spark_*-{VERSION}.jar")):
+    copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
+for jar_file in glob.glob(os.path.join(DS_JAVA_JAR_PATH, 
f"datasketches-java-*.jar")):
+    copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
+for jar_file in glob.glob(os.path.join(DS_JAVA_JAR_PATH, 
f"datasketches-memory-*.jar")):
+    copyfile(jar_file, os.path.join(TEMP_PATH, os.path.basename(jar_file)))
+
+setup(
+    name='datasketches_spark',
+    version=VERSION,
+    author='Apache Software Foundation',
+    author_email='[email protected]',
+    description='The Apache DataSketches Library for Python',
+    license='Apache License 2.0',
+    url='http://datasketches.apache.org',
+    long_description=open('README.md').read(),
+    long_description_content_type='text/markdown',
+    include_package_data=True,
+    package_dir={'':'src'},
+    packages=find_packages(where='src'),
+    install_requires=['pyspark'],
+    python_requires='>=3.8',
+    zip_safe=False
+)
diff --git a/python/datasketches_spark/__init__.py 
b/python/src/datasketches_spark/__init__.py
similarity index 92%
rename from python/datasketches_spark/__init__.py
rename to python/src/datasketches_spark/__init__.py
index ef9effb..c51bb17 100644
--- a/python/datasketches_spark/__init__.py
+++ b/python/src/datasketches_spark/__init__.py
@@ -18,5 +18,5 @@ Provided under the Apache License, Version 2.0
 name = 'datasketches_spark'
 
 from .common import *
-from .common import _invoke_function_over_columns
+#from .common import _invoke_function_over_columns
 from .kll import *
diff --git a/python/datasketches_spark/common.py 
b/python/src/datasketches_spark/common.py
similarity index 79%
rename from python/datasketches_spark/common.py
rename to python/src/datasketches_spark/common.py
index aba63c3..93be8f2 100644
--- a/python/datasketches_spark/common.py
+++ b/python/src/datasketches_spark/common.py
@@ -15,9 +15,32 @@ from py4j.java_gateway import JavaClass
 from typing import Any, TypeVar, Union, Callable
 from functools import lru_cache
 
+import os
+import pkg_resources
+
 ColumnOrName = Union[Column, str]
 ColumnOrName_ = TypeVar("ColumnOrName_", bound=ColumnOrName)
 
+def get_jar_paths(*jar_names: str) -> list[str]:
+    """
+    Returns a list of absolute paths to the provided jars,\n
+    assuming they are included in the package.
+    :param jar_names: Names of jars to retrieve
+    :return: List of absolute paths to jars
+    """
+    jar_paths = []
+    for jar_name in jar_names:
+        try:
+            jar_path = pkg_resources.resource_filename(__name__, 
f"deps/{jar_name}")
+            if os.path.exists(jar_path):
+                jar_paths.append(jar_path)
+            else:
+                raise FileNotFoundError(f"Jar {jar_name} not found in package")
+        except ValueError:
+            raise FileNotFoundError(f"Jar {jar_name} not found in package")
+    return jar_paths
+
+
 # Since we have functions from different packages, rather than the
 # single 16k+ line functions class in core Spark, we'll have each
 # sketch family grab its own functions class from the JVM and cache it
diff --git a/python/datasketches_spark/kll.py 
b/python/src/datasketches_spark/kll.py
similarity index 100%
rename from python/datasketches_spark/kll.py
rename to python/src/datasketches_spark/kll.py
diff --git a/version.cfg.in b/version.cfg.in
new file mode 100644
index 0000000..1821d33
--- /dev/null
+++ b/version.cfg.in
@@ -0,0 +1 @@
+0.1.0.dev0


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to