From 729635db9a627591901b7f5fcd994a8acd49b2a0 Mon Sep 17 00:00:00 2001 From: Serge Rielau Date: Mon, 11 May 2026 10:32:49 -0700 Subject: [PATCH 1/2] [SPARK-56827][INFRA] Clean stale genjavadoc output before JavaUnidoc Stale trees under **/target/java (e.g. partial builds or Test/doc) were aggregated by JavaUnidoc and produced bogus reference errors, failing the Documentation job. Add cleanGenjavadocOutput (spark project) to remove those directories, and invoke it before unidoc in docs/_plugins/build_api_docs.rb. Developers can run build/sbt cleanGenjavadocOutput unidoc locally if needed. --- docs/_plugins/build_api_docs.rb | 4 ++- project/SparkBuild.scala | 54 +++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/docs/_plugins/build_api_docs.rb b/docs/_plugins/build_api_docs.rb index 1ef80bfaf09a4..3072e2ddf85d0 100644 --- a/docs/_plugins/build_api_docs.rb +++ b/docs/_plugins/build_api_docs.rb @@ -129,7 +129,9 @@ def build_spark_scala_and_java_docs_if_necessary return end - command = "build/sbt -Pkinesis-asl unidoc" + # Drop stale genjavadoc trees (target/java) before unifying Javadoc; leftover stubs + # (e.g. from a partial or Test/doc run) can make JavaUnidoc fail (SPARK-56827). + command = "build/sbt -Pkinesis-asl cleanGenjavadocOutput unidoc" puts "Running '#{command}'..." # Two filter passes on the unidoc output: diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 66947a58ac590..ee656263b1d3f 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -1605,6 +1605,55 @@ object Unidoc { import sbtunidoc.JavaUnidocPlugin.autoImport._ import sbtunidoc.ScalaUnidocPlugin.autoImport._ + val cleanGenjavadocOutput = taskKey[Unit]( + "Delete each module's target/java tree (genjavadoc stubs) before unified API docs " + + "so JavaUnidoc cannot pick up stale files (SPARK-56827)." + ) + + /** + * Collect `target/java` directories under the given root. Skips large or irrelevant + * trees (VCS, venvs, generated test data) to keep the walk cheap. + */ + private def genjavadocJavaOutputDirs(root: File): Seq[File] = { + val skipNames = Set( + ".bloop", + ".git", + ".github", + ".idea", + ".metals", + ".venv", + ".vscode", + "bin", + "build", + "dist", + "docs", + "metastore_db", + "node_modules", + "out", + "python", + "R", + "target", + "work" + ) + + def walk(dir: File): Seq[File] = { + val name = dir.getName + if (skipNames.contains(name) || name.startsWith("tpcds-")) { + Nil + } else { + val mine = { + val tj = dir / "target" / "java" + if (tj.isDirectory) Seq(tj) else Nil + } + val children = Option(dir.listFiles()).map(_.toIndexedSeq).getOrElse(Nil) + val subDirs = children.filter(_.isDirectory).flatMap(walk) + mine ++ subDirs + } + } + + walk(root) + } + protected def ignoreUndocumentedPackages(packages: Seq[Seq[File]]): Seq[Seq[File]] = { packages .map(_.filterNot(_.getName.contains("$"))) @@ -1729,6 +1778,11 @@ object Unidoc { inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, kubernetes, yarn, tags, streamingKafka010, sqlKafka010, connectCommon, connect, connectJdbc, connectClient, connectShims, protobuf, profiler, udfWorkerProto, udfWorkerCore), + + cleanGenjavadocOutput := { + IO.delete(genjavadocJavaOutputDirs((ThisBuild / baseDirectory).value)) + () + } ) } From 9fb4d77c46df40608caa213f7e9d771ac14db9c4 Mon Sep 17 00:00:00 2001 From: Serge Rielau Date: Mon, 11 May 2026 16:09:25 -0700 Subject: [PATCH 2/2] [SPARK-56827][INFRA] Use target.all(ScopeFilter) for cleanGenjavadocOutput Replace hand-rolled repo walk with sbt's per-project target enumeration (review feedback on PR #55801). --- project/SparkBuild.scala | 50 ++++------------------------------------ 1 file changed, 4 insertions(+), 46 deletions(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index ee656263b1d3f..f471b186e45b0 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -1610,50 +1610,6 @@ object Unidoc { "so JavaUnidoc cannot pick up stale files (SPARK-56827)." ) - /** - * Collect `target/java` directories under the given root. Skips large or irrelevant - * trees (VCS, venvs, generated test data) to keep the walk cheap. - */ - private def genjavadocJavaOutputDirs(root: File): Seq[File] = { - val skipNames = Set( - ".bloop", - ".git", - ".github", - ".idea", - ".metals", - ".venv", - ".vscode", - "bin", - "build", - "dist", - "docs", - "metastore_db", - "node_modules", - "out", - "python", - "R", - "target", - "work" - ) - - def walk(dir: File): Seq[File] = { - val name = dir.getName - if (skipNames.contains(name) || name.startsWith("tpcds-")) { - Nil - } else { - val mine = { - val tj = dir / "target" / "java" - if (tj.isDirectory) Seq(tj) else Nil - } - val children = Option(dir.listFiles()).map(_.toIndexedSeq).getOrElse(Nil) - val subDirs = children.filter(_.isDirectory).flatMap(walk) - mine ++ subDirs - } - } - - walk(root) - } - protected def ignoreUndocumentedPackages(packages: Seq[Seq[File]]): Seq[Seq[File]] = { packages .map(_.filterNot(_.getName.contains("$"))) @@ -1780,8 +1736,10 @@ object Unidoc { connectClient, connectShims, protobuf, profiler, udfWorkerProto, udfWorkerCore), cleanGenjavadocOutput := { - IO.delete(genjavadocJavaOutputDirs((ThisBuild / baseDirectory).value)) - () + val dirs = target.all(ScopeFilter(inAnyProject)).value + .map(_ / "java") + .filter(_.isDirectory) + IO.delete(dirs) } ) }