Commit c8368043 authored by Josh Rosen's avatar Josh Rosen Committed by Matei Zaharia

Add JavaAPICompletenessChecker.

This is used to find methods in the Scala API that
need to be ported to the Java API.  To use it:

  ./run spark.tools.JavaAPICompletenessChecker
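
The checker prints a section per API class (e.g. "Missing RDD methods"),
with one Java-style signature per line for each Scala method that has no
equivalent in the corresponding Java class.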
Conflicts:
	project/SparkBuild.scala
	run
	run2.cmd
parent 8e38e772
run2.cmd
@@ -33,6 +33,7 @@ set REPL_DIR=%FWDIR%repl
 set EXAMPLES_DIR=%FWDIR%examples
 set BAGEL_DIR=%FWDIR%bagel
 set MLLIB_DIR=%FWDIR%mllib
+set TOOLS_DIR=%FWDIR%tools
 set STREAMING_DIR=%FWDIR%streaming
 set PYSPARK_DIR=%FWDIR%python
@@ -48,6 +49,7 @@ set CLASSPATH=%CLASSPATH%;%FWDIR%repl\lib\*
 set CLASSPATH=%CLASSPATH%;%FWDIR%python\lib\*
 set CLASSPATH=%CLASSPATH%;%BAGEL_DIR%\target\scala-%SCALA_VERSION%\classes
 set CLASSPATH=%CLASSPATH%;%MLLIB_DIR%\target\scala-%SCALA_VERSION%\classes
+set CLASSPATH=%CLASSPATH%;%TOOLS_DIR%\target\scala-%SCALA_VERSION%\classes
 rem Add hadoop conf dir - else FileSystem.*, etc fail
 rem Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts
run
@@ -36,6 +36,7 @@ REPL_BIN_DIR="$FWDIR/repl-bin"
 EXAMPLES_DIR="$FWDIR/examples"
 BAGEL_DIR="$FWDIR/bagel"
 MLLIB_DIR="$FWDIR/mllib"
+TOOLS_DIR="$FWDIR/tools"
 STREAMING_DIR="$FWDIR/streaming"
 PYSPARK_DIR="$FWDIR/python"
@@ -70,6 +71,7 @@ function dev_classpath {
   fi
   CLASSPATH="$CLASSPATH:$BAGEL_DIR/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$MLLIB_DIR/target/scala-$SCALA_VERSION/classes"
+  CLASSPATH="$CLASSPATH:$TOOLS_DIR/target/scala-$SCALA_VERSION/classes"
   for jar in `find $PYSPARK_DIR/lib -name '*jar'`; do
     CLASSPATH="$CLASSPATH:$jar"
   done
pom.xml
@@ -58,6 +58,7 @@
     <module>core</module>
     <module>bagel</module>
     <module>examples</module>
+    <module>tools</module>
     <module>streaming</module>
     <module>repl</module>
     <module>repl-bin</module>
project/SparkBuild.scala
@@ -40,7 +40,7 @@ object SparkBuild extends Build {
   //val HADOOP_MAJOR_VERSION = "2"
   //val HADOOP_YARN = true
-  lazy val root = Project("root", file("."), settings = rootSettings) aggregate(core, repl, examples, bagel, streaming, mllib)
+  lazy val root = Project("root", file("."), settings = rootSettings) aggregate(core, repl, examples, bagel, streaming, mllib, tools)
   lazy val core = Project("core", file("core"), settings = coreSettings)
@@ -48,6 +48,8 @@ object SparkBuild extends Build {
   lazy val examples = Project("examples", file("examples"), settings = examplesSettings) dependsOn (core) dependsOn (streaming)
+  lazy val tools = Project("tools", file("tools"), settings = toolsSettings) dependsOn (core) dependsOn (streaming)
   lazy val bagel = Project("bagel", file("bagel"), settings = bagelSettings) dependsOn (core)
   lazy val streaming = Project("streaming", file("streaming"), settings = streamingSettings) dependsOn (core)
@@ -233,6 +235,10 @@ object SparkBuild extends Build {
     )
   )
+  def toolsSettings = sharedSettings ++ Seq(
+    name := "spark-tools"
+  )
   def bagelSettings = sharedSettings ++ Seq(name := "spark-bagel")
   def mllibSettings = sharedSettings ++ Seq(
tools/pom.xml (new file)

<?xml version="1.0" encoding="UTF-8"?>
<!--
  ~ Licensed to the Apache Software Foundation (ASF) under one or more
  ~ contributor license agreements. See the NOTICE file distributed with
  ~ this work for additional information regarding copyright ownership.
  ~ The ASF licenses this file to You under the Apache License, Version 2.0
  ~ (the "License"); you may not use this file except in compliance with
  ~ the License. You may obtain a copy of the License at
  ~
  ~    http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.spark-project</groupId>
    <artifactId>spark-parent</artifactId>
    <version>0.7.4-SNAPSHOT</version>
    <relativePath>../pom.xml</relativePath>
  </parent>

  <groupId>org.spark-project</groupId>
  <artifactId>spark-tools_2.9.3</artifactId>
  <packaging>jar</packaging>
  <name>Spark Project Tools</name>
  <url>http://spark-project.org/</url>

  <dependencies>
    <dependency>
      <groupId>org.spark-project</groupId>
      <artifactId>spark-core_2.9.3</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.spark-project</groupId>
      <artifactId>spark-streaming_2.9.3</artifactId>
      <version>${project.version}</version>
    </dependency>
  </dependencies>

  <build>
    <outputDirectory>target/scala-${scala.version}/classes</outputDirectory>
    <testOutputDirectory>target/scala-${scala.version}/test-classes</testOutputDirectory>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-source-plugin</artifactId>
      </plugin>
      <plugin>
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>build-helper-maven-plugin</artifactId>
      </plugin>
      <plugin>
        <groupId>org.scalatest</groupId>
        <artifactId>scalatest-maven-plugin</artifactId>
      </plugin>
    </plugins>
  </build>
</project>
tools/src/main/scala/spark/tools/JavaAPICompletenessChecker.scala (new file)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package spark.tools

import java.lang.reflect.Method

import scala.collection.mutable.ArrayBuffer
import scala.Tuple2

import spark._
import spark.api.java._
import spark.streaming.{PairDStreamFunctions, DStream, StreamingContext}
import spark.streaming.api.java.{JavaPairDStream, JavaDStream, JavaStreamingContext}
private[spark] abstract class SparkType(val name: String)

private[spark] case class BaseType(override val name: String) extends SparkType(name) {
  override def toString: String = {
    name
  }
}

private[spark]
case class ParameterizedType(override val name: String,
    parameters: Seq[SparkType],
    typebounds: String = "") extends SparkType(name) {
  override def toString: String = {
    if (typebounds != "") {
      typebounds + " " + name + "<" + parameters.mkString(", ") + ">"
    } else {
      name + "<" + parameters.mkString(", ") + ">"
    }
  }
}

private[spark]
case class SparkMethod(name: String, returnType: SparkType, parameters: Seq[SparkType]) {
  override def toString: String = {
    returnType + " " + name + "(" + parameters.mkString(", ") + ")"
  }
}
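
// Illustrative example (not part of the original file): a method represented as
//   SparkMethod("lookup", ParameterizedType("java.util.List", Seq(BaseType("V"))),
//     Seq(BaseType("K")))
// renders via toString as "java.util.List<V> lookup(K)".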
/**
 * A tool for identifying methods that need to be ported from the Scala API to the Java API.
 *
 * It uses reflection to find methods in the Scala API and rewrites those methods' signatures
 * into the appropriate Java equivalents. Any equivalent method that has not been implemented
 * in the Java API is printed.
 */
object JavaAPICompletenessChecker {

  private def parseType(typeStr: String): SparkType = {
    if (!typeStr.contains("<")) {
      // Base types might begin with "class" or "interface", so we have to strip that off:
      BaseType(typeStr.trim.split(" ").last)
    } else if (typeStr.endsWith("[]")) {
      ParameterizedType("Array", Seq(parseType(typeStr.stripSuffix("[]"))))
    } else {
      val parts = typeStr.split("<", 2)
      val name = parts(0).trim
      assert(parts(1).last == '>')
      val parameters = parts(1).dropRight(1)
      ParameterizedType(name, parseTypeList(parameters))
    }
  }
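
  // For example (illustrative): parseType("java.util.Map<K, scala.Tuple2<K, V>>") yields
  //   ParameterizedType("java.util.Map", Seq(BaseType("K"),
  //     ParameterizedType("scala.Tuple2", Seq(BaseType("K"), BaseType("V"))))).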
  private def parseTypeList(typeStr: String): Seq[SparkType] = {
    val types: ArrayBuffer[SparkType] = new ArrayBuffer[SparkType]
    var stack = 0
    var token: StringBuffer = new StringBuffer()
    for (c <- typeStr.trim) {
      if (c == ',' && stack == 0) {
        // A comma at the top level separates two type parameters:
        types += parseType(token.toString)
        token = new StringBuffer()
      } else if (c == ' ' && stack != 0) {
        // Skip whitespace inside nested type parameter lists.
      } else {
        if (c == '<') {
          stack += 1
        } else if (c == '>') {
          stack -= 1
        }
        token.append(c)
      }
    }
    assert(stack == 0)
    if (token.toString != "") {
      types += parseType(token.toString)
    }
    types.toSeq
  }
  private def parseReturnType(typeStr: String): SparkType = {
    if (typeStr(0) == '<') {
      // Generic methods render as "<bounds> returnType"; strip the leading '<' and split
      // the type bounds from the actual return type:
      val parts = typeStr.drop(1).split(">", 2)
      val parsed = parseType(parts(1)).asInstanceOf[ParameterizedType]
      ParameterizedType(parsed.name, parsed.parameters, parts(0))
    } else {
      parseType(typeStr)
    }
  }
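
  // For example (illustrative): a generic method whose reflected return type prints as
  // "<T> java.util.List<T>" parses to
  //   ParameterizedType("java.util.List", Seq(BaseType("T")), typebounds = "T").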
  private def toSparkMethod(method: Method): SparkMethod = {
    val returnType = parseReturnType(method.getGenericReturnType.toString)
    val name = method.getName
    val parameters = method.getGenericParameterTypes.map(t => parseType(t.toString))
    SparkMethod(name, returnType, parameters)
  }
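
  // For example (illustrative): reflecting on RDD.count(), whose generic return type
  // prints as "long", yields SparkMethod("count", BaseType("long"), Seq()).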
  private def toJavaType(scalaType: SparkType): SparkType = {
    val renameSubstitutions = Map(
      "scala.collection.Map" -> "java.util.Map",
      // TODO: the JavaStreamingContext API accepts Array arguments
      // instead of Lists, so this isn't a trivial translation / sub:
      "scala.collection.Seq" -> "java.util.List",
      "scala.Function2" -> "spark.api.java.function.Function2",
      "scala.collection.Iterator" -> "java.util.Iterator",
      "scala.collection.mutable.Queue" -> "java.util.Queue",
      "double" -> "java.lang.Double"
    )
    // Keep applying the substitutions until we've reached a fixed point.
    def applySubs(scalaType: SparkType): SparkType = {
      scalaType match {
        case ParameterizedType(name, parameters, typebounds) =>
          name match {
            case "spark.RDD" =>
              if (parameters(0).name == classOf[Tuple2[_, _]].getName) {
                val tupleParams =
                  parameters(0).asInstanceOf[ParameterizedType].parameters.map(toJavaType)
                ParameterizedType(classOf[JavaPairRDD[_, _]].getName, tupleParams)
              } else {
                ParameterizedType(classOf[JavaRDD[_]].getName, parameters.map(toJavaType))
              }
            case "spark.streaming.DStream" =>
              if (parameters(0).name == classOf[Tuple2[_, _]].getName) {
                val tupleParams =
                  parameters(0).asInstanceOf[ParameterizedType].parameters.map(toJavaType)
                ParameterizedType("spark.streaming.api.java.JavaPairDStream", tupleParams)
              } else {
                ParameterizedType("spark.streaming.api.java.JavaDStream",
                  parameters.map(toJavaType))
              }
            // TODO: Spark Streaming uses Guava's Optional in place of Option, leading to some
            // false-positives here:
            case "scala.Option" =>
              toJavaType(parameters(0))
            case "scala.Function1" =>
              // For Function1, the last type parameter is the function's result type:
              val resultTypeName = parameters.last.name
              if (resultTypeName.startsWith("scala.collection.Traversable") ||
                  resultTypeName.startsWith("scala.collection.Iterator")) {
                ParameterizedType("spark.api.java.function.FlatMapFunction",
                  Seq(parameters(0),
                    parameters.last.asInstanceOf[ParameterizedType].parameters(0)).map(toJavaType))
              } else if (resultTypeName == "scala.runtime.BoxedUnit") {
                ParameterizedType("spark.api.java.function.VoidFunction",
                  parameters.dropRight(1).map(toJavaType))
              } else {
                ParameterizedType("spark.api.java.function.Function", parameters.map(toJavaType))
              }
            case _ =>
              ParameterizedType(renameSubstitutions.getOrElse(name, name),
                parameters.map(toJavaType))
          }
        case BaseType(name) =>
          if (renameSubstitutions.contains(name)) {
            BaseType(renameSubstitutions(name))
          } else {
            scalaType
          }
      }
    }
    var oldType = scalaType
    var newType = applySubs(scalaType)
    while (oldType != newType) {
      oldType = newType
      newType = applySubs(newType)
    }
    newType
  }
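
  // For example (illustrative): spark.RDD<scala.Tuple2<K, scala.Option<V>>> is rewritten to
  // spark.api.java.JavaPairRDD<K, V>: the Tuple2's parameters become the pair RDD's key and
  // value types, and scala.Option is unwrapped to its element type.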
  private def toJavaMethod(method: SparkMethod): SparkMethod = {
    val params = method.parameters
      .filterNot(_.name == "scala.reflect.ClassManifest")
      .map(toJavaType)
    SparkMethod(method.name, toJavaType(method.returnType), params)
  }
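
  // For example (illustrative): the Scala method RDD.map, reflected as
  //   spark.RDD<U> map(scala.Function1<T, U>, scala.reflect.ClassManifest<U>)
  // becomes
  //   spark.api.java.JavaRDD<U> map(spark.api.java.function.Function<T, U>)
  // once the ClassManifest parameter is dropped and the types are substituted.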
  private def isExcludedByName(method: Method): Boolean = {
    val name = method.getDeclaringClass.getName + "." + method.getName
    // Scala methods that are declared as private[mypackage] become public in the resulting
    // Java bytecode. As a result, we need to manually exclude those methods here.
    // This list also includes a few methods that are only used by the web UI or other
    // internal Spark components.
    val excludedNames = Seq(
      "spark.RDD.origin",
      "spark.RDD.elementClassManifest",
      "spark.RDD.checkpointData",
      "spark.RDD.partitioner",
      "spark.RDD.partitions",
      "spark.RDD.firstParent",
      "spark.RDD.doCheckpoint",
      "spark.RDD.markCheckpointed",
      "spark.RDD.clearDependencies",
      "spark.RDD.getDependencies",
      "spark.RDD.getPartitions",
      "spark.RDD.dependencies",
      "spark.RDD.getPreferredLocations",
      "spark.RDD.collectPartitions",
      "spark.RDD.computeOrReadCheckpoint",
      "spark.PairRDDFunctions.getKeyClass",
      "spark.PairRDDFunctions.getValueClass",
      "spark.SparkContext.stringToText",
      "spark.SparkContext.makeRDD",
      "spark.SparkContext.runJob",
      "spark.SparkContext.runApproximateJob",
      "spark.SparkContext.clean",
      "spark.SparkContext.metadataCleaner",
      "spark.SparkContext.ui",
      "spark.SparkContext.newShuffleId",
      "spark.SparkContext.newRddId",
      "spark.SparkContext.cleanup",
      "spark.SparkContext.receiverJobThread",
      "spark.SparkContext.getRDDStorageInfo",
      "spark.SparkContext.addedFiles",
      "spark.SparkContext.addedJars",
      "spark.SparkContext.persistentRdds",
      "spark.SparkContext.executorEnvs",
      "spark.SparkContext.checkpointDir",
      "spark.SparkContext.getSparkHome",
      "spark.SparkContext.executorMemoryRequested",
      "spark.SparkContext.getExecutorStorageStatus",
      "spark.streaming.DStream.generatedRDDs",
      "spark.streaming.DStream.zeroTime",
      "spark.streaming.DStream.rememberDuration",
      "spark.streaming.DStream.storageLevel",
      "spark.streaming.DStream.mustCheckpoint",
      "spark.streaming.DStream.checkpointDuration",
      "spark.streaming.DStream.checkpointData",
      "spark.streaming.DStream.graph",
      "spark.streaming.DStream.isInitialized",
      "spark.streaming.DStream.parentRememberDuration",
      "spark.streaming.DStream.initialize",
      "spark.streaming.DStream.validate",
      "spark.streaming.DStream.setContext",
      "spark.streaming.DStream.setGraph",
      "spark.streaming.DStream.remember",
      "spark.streaming.DStream.getOrCompute",
      "spark.streaming.DStream.generateJob",
      "spark.streaming.DStream.clearOldMetadata",
      "spark.streaming.DStream.addMetadata",
      "spark.streaming.DStream.updateCheckpointData",
      "spark.streaming.DStream.restoreCheckpointData",
      "spark.streaming.DStream.isTimeValid",
      "spark.streaming.StreamingContext.nextNetworkInputStreamId",
      "spark.streaming.StreamingContext.networkInputTracker",
      "spark.streaming.StreamingContext.checkpointDir",
      "spark.streaming.StreamingContext.checkpointDuration",
      "spark.streaming.StreamingContext.receiverJobThread",
      "spark.streaming.StreamingContext.scheduler",
      "spark.streaming.StreamingContext.initialCheckpoint",
      "spark.streaming.StreamingContext.getNewNetworkStreamId",
      "spark.streaming.StreamingContext.validate",
      "spark.streaming.StreamingContext.createNewSparkContext",
      "spark.streaming.StreamingContext.rddToFileName",
      "spark.streaming.StreamingContext.getSparkCheckpointDir",
      "spark.streaming.StreamingContext.env",
      "spark.streaming.StreamingContext.graph",
      "spark.streaming.StreamingContext.isCheckpointPresent"
    )
    val excludedPatterns = Seq(
      """^spark\.SparkContext\..*To.*Functions""",
      """^spark\.SparkContext\..*WritableConverter""",
      """^spark\.SparkContext\..*To.*Writable"""
    ).map(_.r)
    lazy val excludedByPattern = excludedPatterns.exists(_.findFirstIn(name).isDefined)
    name.contains("$") || excludedNames.contains(name) || excludedByPattern
  }
  private def isExcludedByAnnotation(method: Method): Boolean = {
    method.getAnnotation(classOf[ExcludeFromJavaAPI]) != null
  }

  private def isExcludedByInterface(method: Method): Boolean = {
    val excludedInterfaces =
      Set("spark.Logging", "org.apache.hadoop.mapreduce.HadoopMapReduceUtil")
    def toComparisonKey(method: Method) =
      (method.getReturnType, method.getName, method.getGenericReturnType)
    val interfaces = method.getDeclaringClass.getInterfaces.filter { i =>
      excludedInterfaces.contains(i.getName)
    }
    val excludedMethods = interfaces.flatMap(_.getMethods.map(toComparisonKey))
    excludedMethods.contains(toComparisonKey(method))
  }
  private def printMissingMethods(scalaClass: Class[_], javaClass: Class[_]) {
    val methods = scalaClass.getMethods
      .filterNot(_.isAccessible)
      .filterNot(isExcludedByAnnotation)
      .filterNot(isExcludedByName)
      .filterNot(isExcludedByInterface)
    val javaEquivalents = methods.map(m => toJavaMethod(toSparkMethod(m))).toSet
    val javaMethods = javaClass.getMethods.map(toSparkMethod).toSet
    val missingMethods = javaEquivalents -- javaMethods
    for (method <- missingMethods) {
      println(method)
    }
  }
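
  // Note that the comparison is based on the parsed signatures: a Scala method counts as
  // implemented only if its rewritten signature exactly matches one of the Java class's
  // method signatures.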
  def main(args: Array[String]) {
    println("Missing RDD methods")
    printMissingMethods(classOf[RDD[_]], classOf[JavaRDD[_]])
    println()

    println("Missing PairRDD methods")
    printMissingMethods(classOf[PairRDDFunctions[_, _]], classOf[JavaPairRDD[_, _]])
    println()

    println("Missing DoubleRDD methods")
    printMissingMethods(classOf[DoubleRDDFunctions], classOf[JavaDoubleRDD])
    println()

    println("Missing OrderedRDD methods")
    printMissingMethods(classOf[OrderedRDDFunctions[_, _]], classOf[JavaPairRDD[_, _]])
    println()

    println("Missing SparkContext methods")
    printMissingMethods(classOf[SparkContext], classOf[JavaSparkContext])
    println()

    println("Missing StreamingContext methods")
    printMissingMethods(classOf[StreamingContext], classOf[JavaStreamingContext])
    println()

    println("Missing DStream methods")
    printMissingMethods(classOf[DStream[_]], classOf[JavaDStream[_]])
    println()

    println("Missing PairDStream methods")
    printMissingMethods(classOf[PairDStreamFunctions[_, _]], classOf[JavaPairDStream[_, _]])
    println()
  }
}