SPARK-2341 [MLLIB] loadLibSVMFile doesn't handle regression datasets

Per discussion at https://issues.apache.org/jira/browse/SPARK-2341 , this is a look at deprecating the multiclass parameter. Thoughts welcome of course. Author: Sean Owen <srowen@gmail.com> Closes #1663 from srowen/SPARK-2341 and squashes the following commits: 8a3abd7 [Sean Owen] Suppress MIMA error for removed package private classes 18a8c8e [Sean Owen] Updates from review 83d0092 [Sean Owen] Deprecated methods with multiclass, and instead always parse target as a double (ie. multiclass = true)

SPARK-2341 [MLLIB] loadLibSVMFile doesn't handle regression datasets
e9b275b7 · Sean Owen · Xiangrui Meng · 88a519db · e9b275b7 · e9b275b7
Commit e9b275b7 authored 10 years ago by Sean Owen Committed by Xiangrui Meng 10 years ago
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
@@ -91,7 +91,7 @@ object LinearRegression extends App {

    Logger.getRootLogger.setLevel(Level.WARN)

-    val examples = MLUtils.loadLibSVMFile(sc, params.input, multiclass = true).cache()
+    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()

--- a/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala
@@ -22,7 +22,7 @@ import scopt.OptionParser

 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.mllib.classification.NaiveBayes
-import org.apache.spark.mllib.util.{MLUtils, MulticlassLabelParser}
+import org.apache.spark.mllib.util.MLUtils

 /**
 * An example naive Bayes app. Run with
@@ -76,7 +76,7 @@ object SparseNaiveBayes {
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
-      MLUtils.loadLibSVMFile(sc, params.input, multiclass = true, params.numFeatures, minPartitions)
+      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()


--- a/mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.mllib.util
-
-/** Trait for label parsers. */
-private trait LabelParser extends Serializable {
-  /** Parses a string label into a double label. */
-  def parse(labelString: String): Double
-}
-
-/** Factory methods for label parsers. */
-private object LabelParser {
-  def getInstance(multiclass: Boolean): LabelParser = {
-    if (multiclass) MulticlassLabelParser else BinaryLabelParser
-  }
-}
-
-/**
- * Label parser for binary labels, which outputs 1.0 (positive) if the value is greater than 0.5,
- * or 0.0 (negative) otherwise. So it works with +1/-1 labeling and +1/0 labeling.
- */
-private object BinaryLabelParser extends LabelParser {
-  /** Gets the default instance of BinaryLabelParser. */
-  def getInstance(): LabelParser = this
-
-  /**
-   * Parses the input label into positive (1.0) if the value is greater than 0.5,
-   * or negative (0.0) otherwise.
-   */
-  override def parse(labelString: String): Double = if (labelString.toDouble > 0.5) 1.0 else 0.0
-}
-
-/**
- * Label parser for multiclass labels, which converts the input label to double.
- */
-private object MulticlassLabelParser extends LabelParser {
-  /** Gets the default instance of MulticlassLabelParser. */
-  def getInstance(): LabelParser = this
-
-  override def parse(labelString: String): Double =  labelString.toDouble
-}
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -55,7 +55,6 @@ object MLUtils {
   *
   * @param sc Spark context
   * @param path file or directory path in any Hadoop-supported file system URI
-   * @param labelParser parser for labels
   * @param numFeatures number of features, which will be determined from the input data if a
   *                    nonpositive value is given. This is useful when the dataset is already split
   *                    into multiple files and you want to load them separately, because some
@@ -64,10 +63,9 @@ object MLUtils {
   * @param minPartitions min number of partitions
   * @return labeled data stored as an RDD[LabeledPoint]
   */
-  private def loadLibSVMFile(
+  def loadLibSVMFile(
      sc: SparkContext,
      path: String,
-      labelParser: LabelParser,
      numFeatures: Int,
      minPartitions: Int): RDD[LabeledPoint] = {
    val parsed = sc.textFile(path, minPartitions)
@@ -75,7 +73,7 @@ object MLUtils {
      .filter(line => !(line.isEmpty || line.startsWith("#")))
      .map { line =>
        val items = line.split(' ')
-        val label = labelParser.parse(items.head)
+        val label = items.head.toDouble
        val (indices, values) = items.tail.map { item =>
          val indexAndValue = item.split(':')
          val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based.
@@ -102,64 +100,46 @@ object MLUtils {

  // Convenient methods for `loadLibSVMFile`.

-  /**
-   * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint].
-   * The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR.
-   * Each line represents a labeled sparse feature vector using the following format:
-   * {{{label index1:value1 index2:value2 ...}}}
-   * where the indices are one-based and in ascending order.
-   * This method parses each line into a [[org.apache.spark.mllib.regression.LabeledPoint]],
-   * where the feature indices are converted to zero-based.
-   *
-   * @param sc Spark context
-   * @param path file or directory path in any Hadoop-supported file system URI
-   * @param multiclass whether the input labels contain more than two classes. If false, any label
-   *                   with value greater than 0.5 will be mapped to 1.0, or 0.0 otherwise. So it
-   *                   works for both +1/-1 and 1/0 cases. If true, the double value parsed directly
-   *                   from the label string will be used as the label value.
-   * @param numFeatures number of features, which will be determined from the input data if a
-   *                    nonpositive value is given. This is useful when the dataset is already split
-   *                    into multiple files and you want to load them separately, because some
-   *                    features may not present in certain files, which leads to inconsistent
-   *                    feature dimensions.
-   * @param minPartitions min number of partitions
-   * @return labeled data stored as an RDD[LabeledPoint]
-   */
-   def loadLibSVMFile(
+  @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
+  def loadLibSVMFile(
      sc: SparkContext,
      path: String,
      multiclass: Boolean,
      numFeatures: Int,
      minPartitions: Int): RDD[LabeledPoint] =
-    loadLibSVMFile(sc, path, LabelParser.getInstance(multiclass), numFeatures, minPartitions)
+    loadLibSVMFile(sc, path, numFeatures, minPartitions)

  /**
   * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the default number of
   * partitions.
   */
+  def loadLibSVMFile(
+      sc: SparkContext,
+      path: String,
+      numFeatures: Int): RDD[LabeledPoint] =
+    loadLibSVMFile(sc, path, numFeatures, sc.defaultMinPartitions)
+
+  @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
  def loadLibSVMFile(
      sc: SparkContext,
      path: String,
      multiclass: Boolean,
      numFeatures: Int): RDD[LabeledPoint] =
-    loadLibSVMFile(sc, path, multiclass, numFeatures, sc.defaultMinPartitions)
+    loadLibSVMFile(sc, path, numFeatures)

-  /**
-   * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the number of features
-   * determined automatically and the default number of partitions.
-   */
+  @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
  def loadLibSVMFile(
      sc: SparkContext,
      path: String,
      multiclass: Boolean): RDD[LabeledPoint] =
-    loadLibSVMFile(sc, path, multiclass, -1, sc.defaultMinPartitions)
+    loadLibSVMFile(sc, path)

  /**
   * Loads binary labeled data in the LIBSVM format into an RDD[LabeledPoint], with number of
   * features determined automatically and the default number of partitions.
   */
  def loadLibSVMFile(sc: SparkContext, path: String): RDD[LabeledPoint] =
-    loadLibSVMFile(sc, path, multiclass = false, -1, sc.defaultMinPartitions)
+    loadLibSVMFile(sc, path, -1)

  /**
   * Save labeled data in LIBSVM format.

--- a/mllib/src/test/scala/org/apache/spark/mllib/util/LabelParsersSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/LabelParsersSuite.scala
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.mllib.util
-
-import org.scalatest.FunSuite
-
-class LabelParsersSuite extends FunSuite {
-  test("binary label parser") {
-    for (parser <- Seq(BinaryLabelParser, BinaryLabelParser.getInstance())) {
-      assert(parser.parse("+1") === 1.0)
-      assert(parser.parse("1") === 1.0)
-      assert(parser.parse("0") === 0.0)
-      assert(parser.parse("-1") === 0.0)
-    }
-  }
-
-  test("multiclass label parser") {
-    for (parser <- Seq(MulticlassLabelParser, MulticlassLabelParser.getInstance())) {
-      assert(parser.parse("0") == 0.0)
-      assert(parser.parse("+1") === 1.0)
-      assert(parser.parse("1") === 1.0)
-      assert(parser.parse("2") === 2.0)
-      assert(parser.parse("3") === 3.0)
-    }
-  }
-}
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
@@ -63,9 +63,9 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext {
  test("loadLibSVMFile") {
    val lines =
      """
-        |+1 1:1.0 3:2.0 5:3.0
-        |-1
-        |-1 2:4.0 4:5.0 6:6.0
+        |1 1:1.0 3:2.0 5:3.0
+        |0
+        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val tempDir = Files.createTempDir()
    tempDir.deleteOnExit()
@@ -73,7 +73,7 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext {
    Files.write(lines, file, Charsets.US_ASCII)
    val path = tempDir.toURI.toString

-    val pointsWithNumFeatures = loadLibSVMFile(sc, path, multiclass = false, 6).collect()
+    val pointsWithNumFeatures = loadLibSVMFile(sc, path, 6).collect()
    val pointsWithoutNumFeatures = loadLibSVMFile(sc, path).collect()

    for (points <- Seq(pointsWithNumFeatures, pointsWithoutNumFeatures)) {
@@ -86,11 +86,11 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext {
      assert(points(2).features === Vectors.sparse(6, Seq((1, 4.0), (3, 5.0), (5, 6.0))))
    }

-    val multiclassPoints = loadLibSVMFile(sc, path, multiclass = true).collect()
+    val multiclassPoints = loadLibSVMFile(sc, path).collect()
    assert(multiclassPoints.length === 3)
    assert(multiclassPoints(0).label === 1.0)
-    assert(multiclassPoints(1).label === -1.0)
-    assert(multiclassPoints(2).label === -1.0)
+    assert(multiclassPoints(1).label === 0.0)
+    assert(multiclassPoints(2).label === 0.0)

    Utils.deleteRecursively(tempDir)
  }

--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -97,6 +97,14 @@ object MimaExcludes {
              "org.apache.spark.mllib.tree.impurity.Entropy.calculate"),
            ProblemFilters.exclude[IncompatibleMethTypeProblem](
              "org.apache.spark.mllib.tree.impurity.Variance.calculate")
+          ) ++
+          Seq ( // Package-private classes removed in SPARK-2341
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.BinaryLabelParser"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.BinaryLabelParser$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.LabelParser"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.LabelParser$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.MulticlassLabelParser"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.MulticlassLabelParser$")
          )
        case v if v.startsWith("1.0") =>
          Seq(

--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -29,15 +29,18 @@ class MLUtils:
    Helper methods to load, save and pre-process data used in MLlib.
    """

+    @deprecated
    @staticmethod
    def _parse_libsvm_line(line, multiclass):
+        return _parse_libsvm_line(line)
+
+    @staticmethod
+    def _parse_libsvm_line(line):
        """
        Parses a line in LIBSVM format into (label, indices, values).
        """
        items = line.split(None)
        label = float(items[0])
-        if not multiclass:
-            label = 1.0 if label > 0.5 else 0.0
        nnz = len(items) - 1
        indices = np.zeros(nnz, dtype=np.int32)
        values = np.zeros(nnz)
@@ -64,8 +67,13 @@ class MLUtils:
                            " but got " % type(v))
        return " ".join(items)

+    @deprecated
    @staticmethod
    def loadLibSVMFile(sc, path, multiclass=False, numFeatures=-1, minPartitions=None):
+        return loadLibSVMFile(sc, path, numFeatures, minPartitions)
+
+    @staticmethod
+    def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None):
        """
        Loads labeled data in the LIBSVM format into an RDD of
        LabeledPoint. The LIBSVM format is a text-based format used by
@@ -81,13 +89,6 @@ class MLUtils:
        @param sc: Spark context
        @param path: file or directory path in any Hadoop-supported file
                     system URI
-        @param multiclass: whether the input labels contain more than
-                           two classes. If false, any label with value
-                           greater than 0.5 will be mapped to 1.0, or
-                           0.0 otherwise. So it works for both +1/-1 and
-                           1/0 cases. If true, the double value parsed
-                           directly from the label string will be used
-                           as the label value.
        @param numFeatures: number of features, which will be determined
                            from the input data if a nonpositive value
                            is given. This is useful when the dataset is
@@ -105,7 +106,7 @@ class MLUtils:
        >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
        >>> tempFile.flush()
        >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
-        >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name, True).collect()
+        >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
        >>> tempFile.close()
        >>> type(examples[0]) == LabeledPoint
        True
@@ -124,7 +125,7 @@ class MLUtils:
        """

        lines = sc.textFile(path, minPartitions)
-        parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l, multiclass))
+        parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
        if numFeatures <= 0:
            parsed.cache()
            numFeatures = parsed.map(lambda x: 0 if x[1].size == 0 else x[1][-1]).reduce(max) + 1