From 180fa0d85bc3dcc3eedf584994d4e94784ee896a Mon Sep 17 00:00:00 2001
From: Yi Cheng
Date: Mon, 27 Oct 2025 12:24:30 -0700
Subject: [PATCH 1/4] build

---
 build.sbt                           | 16 +++++++++-------
 docs-site/docs/docs/installation.md |  8 +++++---
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/build.sbt b/build.sbt
index 0c86fb858..4ecbed4d1 100644
--- a/build.sbt
+++ b/build.sbt
@@ -218,9 +218,11 @@ val `polynote-server` = project.settings(
   Test / testOptions += Tests.Argument("-oF")
 ).dependsOn(`polynote-runtime` % "provided", `polynote-runtime` % "test", `polynote-kernel` % "provided", `polynote-kernel` % "test->test")
 
+// Supported Spark versions for each Scala binary version
+// The default version (used if SPARK_VERSION env var is not set) is the last one in each list
 val sparkVersions = Map(
-  "2.12" -> "3.1.2",
-  "2.13" -> "3.2.1"
+  "2.12" -> Seq("3.3.4", "3.5.7"),
+  "2.13" -> Seq("3.3.4", "3.5.7")
 )
 // keep expected checksums here. This has two benefits over checking the sha512sum from the archive:
 //
@@ -229,8 +231,8 @@
 // See https://issues.apache.org/jira/browse/SPARK-30683
 // To add to this list, download the tarball for the new version from the apache repo and run `sha512sum <tarball>.tgz`
 val sparkChecksums = Map(
-  "3.1.2" -> "ba47e074b2a641b23ee900d4e28260baa250e2410859d481b38f2ead888c30daea3683f505608870148cf40f76c357222a2773f1471e7342c622e93bf02479b7",
-  "3.2.1" -> "2ec9f1cb65af5ee7657ca83a1abaca805612b8b3a1d8d9bb67e317106025c81ba8d44d82ad6fdb45bbe6caa768d449cd6a4945ec050ce9390f806f46c5cb1397"
+  "3.3.4" -> "a3874e340a113e95898edfa145518648700f799ffe2d1ce5dde7743e88fdf5559d79d9bcb1698fdfa5296a63c1d0fc4c8e32a93529ed58cd5dcf0721502a1fc7",
+  "3.5.7" -> "f3b7d5974d746b9aaecb19104473da91068b698a4d292177deb75deb83ef9dc7eb77062446940561ac9ab7ee3336fb421332b1c877292dab4ac1b6ca30f4f2e0"
 )
 
 val sparkDistUrl: String => String =
@@ -240,7 +242,7 @@ val sparkSettings = Seq(
   resolvers ++= {
     Seq(MavenRepository(name = "Apache Staging", root = "https://repository.apache.org/content/repositories/staging"))
   },
-  sparkVersion := sparkVersions(scalaBinaryVersion.value),
+  sparkVersion := sys.env.getOrElse("SPARK_VERSION", sparkVersions(scalaBinaryVersion.value).last),
   libraryDependencies ++= Seq(
     "org.apache.spark" %% "spark-sql" % sparkVersion.value % "provided",
     "org.apache.spark" %% "spark-repl" % sparkVersion.value % "provided",
@@ -252,13 +254,13 @@ val sparkSettings = Seq(
       .getOrElse((file(".").getAbsoluteFile / "target" / "spark").getCanonicalPath)
   },
   sparkHome := {
-    (file(sparkInstallLocation.value) / s"spark-${sparkVersion.value}-bin-hadoop2.7").toString
+    (file(sparkInstallLocation.value) / s"spark-${sparkVersion.value}-bin-hadoop3").toString
   },
   Test / testOptions += Tests.Setup { () =>
     import sys.process._
     val baseDir = file(sparkInstallLocation.value)
     val distVersion = sparkVersion.value
-    val pkgName = if (scalaBinaryVersion.value == "2.13") s"spark-$distVersion-bin-hadoop2.7-scala2.13" else s"spark-$distVersion-bin-hadoop2.7"
+    val pkgName = if (scalaBinaryVersion.value == "2.13") s"spark-$distVersion-bin-hadoop3-scala2.13" else s"spark-$distVersion-bin-hadoop3"
     val filename = s"$pkgName.tgz"
     val distUrl = url(s"${sparkDistUrl(distVersion)}/$filename")
     val destDir = baseDir / pkgName
diff --git a/docs-site/docs/docs/installation.md b/docs-site/docs/docs/installation.md
index 8a9eb325a..ae561a51d 100644
--- a/docs-site/docs/docs/installation.md
+++ b/docs-site/docs/docs/installation.md
@@ -58,10 +58,12 @@ environment.
 
tip "Using Spark with Polynote" If you'll be using Spark with Polynote, please make sure you read this [note about Spark and Polynote](basic-usage.md#using-spark-with-polynote) for more information. - - Currently, Polynote supports both **Spark 2.1** (with Scala 2.11) and **2.4** (with Scala 2.11 and 2.12). - _Some users have had success running Spark 3.0 with Scala 2.12. Please see [this issue](https://github.com/polynote/polynote/issues/926) for more information_ + Currently, Polynote supports the following Spark versions: + - **Scala 2.12**: Spark 3.3.4, 3.5.7 (default: 3.5.7) + - **Scala 2.13**: Spark 3.3.4, 3.5.7 (default: 3.5.7) + + You can override the default Spark version by setting the `SPARK_VERSION` environment variable when building. Polynote will use the `spark-submit` command in order to start Spark kernels. From e3739fa6d2ad3be131f6908a42659d4ae2fcd582 Mon Sep 17 00:00:00 2001 From: Yi Cheng Date: Mon, 27 Oct 2025 12:27:13 -0700 Subject: [PATCH 2/4] dist --- .github/workflows/dist.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml index f9600a567..b05c43b63 100644 --- a/.github/workflows/dist.yml +++ b/.github/workflows/dist.yml @@ -35,16 +35,17 @@ jobs: run: | echo "${DOCKER_PASSWORD}" | docker login -u ${DOCKER_USERNAME} --password-stdin export POLYNOTE_VERSION=${GITHUB_REF#refs/tags/} - export SPARK_VERSION="2.4.5" export SCALA_VERSION="2.12" docker build --build-arg POLYNOTE_VERSION --build-arg SCALA_VERSION -t polynote/polynote:${POLYNOTE_VERSION}-${SCALA_VERSION} docker/base/ docker push polynote/polynote:${POLYNOTE_VERSION}-${SCALA_VERSION} - docker build --build-arg POLYNOTE_VERSION --build-arg SCALA_VERSION -t polynote/polynote:${POLYNOTE_VERSION}-${SCALA_VERSION}-spark2.4 docker/spark - docker push polynote/polynote:${POLYNOTE_VERSION}-${SCALA_VERSION}-spark2.4 - export SPARK_VERSION="3.1.2" - docker build --build-arg POLYNOTE_VERSION --build-arg SCALA_VERSION -t polynote/polynote:${POLYNOTE_VERSION}-${SCALA_VERSION}-spark3.1 docker/spark - docker push polynote/polynote:${POLYNOTE_VERSION}-${SCALA_VERSION}-spark3.1 - echo "Setting latest tag to ${POLYNOTE_VERSION}-${SCALA_VERSION}-spark3.1" + export SPARK_VERSION="3.3.4" + docker build --build-arg POLYNOTE_VERSION --build-arg SCALA_VERSION -t polynote/polynote:${POLYNOTE_VERSION}-${SCALA_VERSION}-spark3.3 docker/spark + docker push polynote/polynote:${POLYNOTE_VERSION}-${SCALA_VERSION}-spark3.3 + export SPARK_VERSION="3.5.7" + docker build --build-arg POLYNOTE_VERSION --build-arg SCALA_VERSION -t polynote/polynote:${POLYNOTE_VERSION}-${SCALA_VERSION}-spark3.5 docker/spark + docker push polynote/polynote:${POLYNOTE_VERSION}-${SCALA_VERSION}-spark3.5 + echo "Setting latest tag to ${POLYNOTE_VERSION}-${SCALA_VERSION}-spark3.5" + docker tag polynote/polynote:${POLYNOTE_VERSION}-${SCALA_VERSION}-spark3.5 polynote/polynote:latest docker push polynote/polynote:latest env: DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} From b04417606cce96093a02c503f56d1036dbdf2f22 Mon Sep 17 00:00:00 2001 From: Yi Cheng Date: Mon, 27 Oct 2025 12:31:17 -0700 Subject: [PATCH 3/4] test both spark versions --- .github/workflows/ci-backend-2.12.yml | 6 +++++- .github/workflows/ci-backend-2.13.yml | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-backend-2.12.yml b/.github/workflows/ci-backend-2.12.yml index 6451e1c92..3a7e294fa 100644 --- a/.github/workflows/ci-backend-2.12.yml +++ 
+++ b/.github/workflows/ci-backend-2.12.yml
@@ -21,7 +21,7 @@ jobs:
         with:
           java-version: 8
           architecture: x64
-      - name: Build
+      - name: Build and Test
        run: |
          echo "Setting up Python dependencies"
          pip install -r ./requirements.txt
@@ -32,5 +32,9 @@
          export LD_PRELOAD=${jep_lib_path}/libpython3.so
 
          pushd $GITHUB_WORKSPACE
+         echo "Testing with Spark 3.3.4"
+         SPARK_VERSION=3.3.4 sbt 'set scalaVersion := "2.12.12"' test
+
+         echo "Testing with Spark 3.5.7 (default)"
          sbt 'set scalaVersion := "2.12.12"' test
          popd
diff --git a/.github/workflows/ci-backend-2.13.yml b/.github/workflows/ci-backend-2.13.yml
index 736610bce..4173c76eb 100644
--- a/.github/workflows/ci-backend-2.13.yml
+++ b/.github/workflows/ci-backend-2.13.yml
@@ -21,7 +21,7 @@ jobs:
         with:
           java-version: 8
           architecture: x64
-      - name: Build
+      - name: Build and Test
        run: |
          echo "Setting up Python dependencies"
          pip install -r ./requirements.txt
@@ -32,5 +32,9 @@
          export LD_PRELOAD=${jep_lib_path}/libpython3.so
 
          pushd $GITHUB_WORKSPACE
+         echo "Testing with Spark 3.3.4"
+         SPARK_VERSION=3.3.4 sbt 'set scalaVersion := "2.13.6"' test
+
+         echo "Testing with Spark 3.5.7 (default)"
          sbt 'set scalaVersion := "2.13.6"' test
          popd

From 463d1c1e9765083ab51525f4f931bf68de4eab09 Mon Sep 17 00:00:00 2001
From: Yi Cheng
Date: Mon, 27 Oct 2025 15:02:46 -0700
Subject: [PATCH 4/4] 3.3+ sql listener to visitor

---
 .../polynote/kernel/interpreter/sql/Parser.scala  | 13 ++++++++++---
 .../interpreter/sql/SparkSqlInterpreter.scala     |  4 ++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/polynote-spark/src/main/scala/polynote/kernel/interpreter/sql/Parser.scala b/polynote-spark/src/main/scala/polynote/kernel/interpreter/sql/Parser.scala
index 06d4523cb..faa229317 100644
--- a/polynote-spark/src/main/scala/polynote/kernel/interpreter/sql/Parser.scala
+++ b/polynote-spark/src/main/scala/polynote/kernel/interpreter/sql/Parser.scala
@@ -32,16 +32,23 @@ class Parser {
 
     val tableIdentifiers = ListBuffer[Parser.TableIdentifier]()
 
-    parser.addParseListener(new SqlBaseBaseListener {
-      override def exitTableIdentifier(ctx: SqlBaseParser.TableIdentifierContext): Unit = {
+    // Visitor to extract table identifiers
+    val tableIdentifierVisitor = new SqlBaseParserBaseVisitor[Unit] {
+      override def visitTableIdentifier(ctx: SqlBaseParser.TableIdentifierContext): Unit = {
         val db = Option(ctx.db).map(_.getText).filter(_.nonEmpty)
         val name = Option(ctx.table).map(_.getText).getOrElse("")
         tableIdentifiers += Parser.TableIdentifier(db, name)
+        super.visitTableIdentifier(ctx)
       }
-    })
+
+      override def defaultResult(): Unit = ()
+      override def aggregateResult(aggregate: Unit, nextResult: Unit): Unit = ()
+    }
 
     try {
       val statement = parser.singleStatement()
+      // Visit the parse tree to extract table identifiers
+      tableIdentifierVisitor.visit(statement)
       val result = Parser.Result(statement, tableIdentifiers.toList)
       if (errors.nonEmpty) {
         Ior.both(CompileErrors(errors.toList), result)
diff --git a/polynote-spark/src/main/scala/polynote/kernel/interpreter/sql/SparkSqlInterpreter.scala b/polynote-spark/src/main/scala/polynote/kernel/interpreter/sql/SparkSqlInterpreter.scala
index 82e45e4bd..b8163a2ff 100644
--- a/polynote-spark/src/main/scala/polynote/kernel/interpreter/sql/SparkSqlInterpreter.scala
+++ b/polynote-spark/src/main/scala/polynote/kernel/interpreter/sql/SparkSqlInterpreter.scala
@@ -1,7 +1,7 @@
 package polynote.kernel.interpreter.sql
 
 import org.apache.spark.sql.catalyst.parser.SqlBaseParser.SingleStatementContext
-import org.apache.spark.sql.catalyst.parser.{SqlBaseBaseVisitor, SqlBaseParser}
+import org.apache.spark.sql.catalyst.parser.{SqlBaseParserBaseVisitor, SqlBaseParser}
 import org.apache.spark.sql.thief.SessionStateThief
 import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
 import polynote.kernel.environment.CurrentNotebook
@@ -92,7 +92,7 @@
 
   def shutdown(): Task[Unit] = ZIO.unit
 
-  private class CompletionVisitor(pos: Int, availableSymbols: mutable.TreeSet[String]) extends SqlBaseBaseVisitor[List[Completion]] {
+  private class CompletionVisitor(pos: Int, availableSymbols: mutable.TreeSet[String]) extends SqlBaseParserBaseVisitor[List[Completion]] {
     override def defaultResult(): List[Completion] = Nil
     override def aggregateResult(aggregate: List[Completion], nextResult: List[Completion]): List[Completion] = aggregate ++ nextResult
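
Addendum on PATCH 4/4 (reviewer note, not part of the series): per the subject, Spark 3.3+ renames the generated ANTLR helpers (SqlBaseBaseListener/SqlBaseBaseVisitor become SqlBaseParserBaseListener/SqlBaseParserBaseVisitor), and the patch replaces a listener that fired during parsing with a visitor that must be driven explicitly over the finished tree. The self-contained Scala sketch below illustrates the visitor contract the patch relies on; Node, Table and Select are hypothetical stand-ins for the generated contexts, not Spark classes.

import scala.collection.mutable.ListBuffer

// Hypothetical stand-ins for ANTLR's generated parse-tree contexts.
sealed trait Node { def children: List[Node] }
case class Table(db: Option[String], name: String) extends Node { def children = Nil }
case class Select(children: List[Node]) extends Node

// Mirrors ANTLR's AbstractParseTreeVisitor contract: child results are folded
// with aggregateResult starting from defaultResult, and a generated visitX
// defaults to visitChildren, so an override must recurse itself to keep descending.
abstract class BaseVisitor[T] {
  def defaultResult(): T
  def aggregateResult(aggregate: T, nextResult: T): T
  def visit(node: Node): T = node match {
    case t: Table => visitTable(t)
    case other    => visitChildren(other)
  }
  def visitTable(ctx: Table): T = visitChildren(ctx)
  def visitChildren(node: Node): T =
    node.children.foldLeft(defaultResult())((acc, c) => aggregateResult(acc, visit(c)))
}

object VisitorDemo extends App {
  val found = ListBuffer[(Option[String], String)]()
  val visitor = new BaseVisitor[Unit] {
    // Unit-valued visitor: results carry nothing, identifiers accumulate by side effect.
    override def defaultResult(): Unit = ()
    override def aggregateResult(aggregate: Unit, nextResult: Unit): Unit = ()
    override def visitTable(ctx: Table): Unit = {
      found += ((ctx.db, ctx.name))
      visitChildren(ctx) // keep descending, like the patch's super.visitTableIdentifier(ctx)
    }
  }
  visitor.visit(Select(List(Table(None, "a"), Select(List(Table(Some("db"), "b"))))))
  println(found.toList) // List((None,a), (Some(db),b))
}

This is also why the patch overrides defaultResult and aggregateResult for Unit and calls super.visitTableIdentifier(ctx): without the recursive call, table identifiers inside nested subqueries would never be visited.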
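
Addendum on PATCH 1/4 (reviewer note, not part of the series): a new sparkChecksums entry is the hex SHA-512 of the downloaded tarball, exactly what `sha512sum` prints, as the build.sbt comment says. A standalone Scala equivalent (a hypothetical helper, not part of the build) for environments without coreutils:

import java.nio.file.{Files, Paths}
import java.security.MessageDigest

// Prints the hex SHA-512 of the file given as the first argument, matching
// `sha512sum` output, so a new Spark version can be added to sparkChecksums.
object Sha512File extends App {
  val digest = MessageDigest.getInstance("SHA-512")
  val in = Files.newInputStream(Paths.get(args(0)))
  try {
    val buf = new Array[Byte](8192)
    Iterator.continually(in.read(buf)).takeWhile(_ != -1).foreach { n =>
      if (n > 0) digest.update(buf, 0, n)
    }
  } finally in.close()
  println(digest.digest().map("%02x".format(_)).mkString)
}
// usage (assumed): compile and run with the tarball path, e.g.
//   Sha512File spark-3.5.7-bin-hadoop3.tgz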