diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java index d554377975b1b7e66921014feef07a508a1c459a..0a6e9c2a1f93c218462d0e6f76b23de7acbdd6d6 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java @@ -58,7 +58,7 @@ public class JavaBinarizerExample { .setThreshold(0.5); Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame); Dataset<Row> binarizedFeatures = binarizedDataFrame.select("binarized_feature"); - for (Row r : binarizedFeatures.collectRows()) { + for (Row r : binarizedFeatures.collectAsList()) { Double binarized_value = r.getDouble(0); System.out.println(binarized_value); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java index 90bc94c45bbf938d55b44454573781382994b2b7..07edeb3e521c30b4533294b24a75228f86c85612 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java @@ -117,7 +117,7 @@ public class JavaCrossValidatorExample { // Make predictions on test documents. cvModel uses the best model found (lrModel). Dataset<Row> predictions = cvModel.transform(test); - for (Row r: predictions.select("id", "text", "probability", "prediction").collectRows()) { + for (Row r: predictions.select("id", "text", "probability", "prediction").collectAsList()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java index e8ae100d685294ba7ec7d7245015f00924088c51..8a10dd48aa72fa4bbc56a73d944d8d144df944e1 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -85,7 +85,7 @@ public class JavaDeveloperApiExample { // Make predictions on test documents. cvModel uses the best model found (lrModel). Dataset<Row> results = model.transform(test); double sumPredictions = 0; - for (Row r : results.select("features", "label", "prediction").collectRows()) { + for (Row r : results.select("features", "label", "prediction").collectAsList()) { sumPredictions += r.getDouble(2); } if (sumPredictions != 0.0) { diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java index f13698ae5e07ed1cf7fc9d3aa2500f5895af12e0..604b193dd489bad916c30079d3de081561ad4170 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java @@ -100,7 +100,8 @@ public class JavaEstimatorTransformerParamExample { // Note that model2.transform() outputs a 'myProbability' column instead of the usual // 'probability' column since we renamed the lr.probabilityCol parameter previously. Dataset<Row> results = model2.transform(test); - for (Row r : results.select("features", "label", "myProbability", "prediction").collectRows()) { + Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction"); + for (Row r: rows.collectAsList()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java index e394605db70ea001b39b9ffe75f90ab66cd47e7e..c4122d1247a94f5a4deb1929732f9a8accc6f9af 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java @@ -111,7 +111,7 @@ public class JavaModelSelectionViaCrossValidationExample { // Make predictions on test documents. cvModel uses the best model found (lrModel). Dataset<Row> predictions = cvModel.transform(test); - for (Row r : predictions.select("id", "text", "probability", "prediction").collectRows()) { + for (Row r : predictions.select("id", "text", "probability", "prediction").collectAsList()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java index 0305f737ca94ca7ad234099b2e2fd7714be6febc..608bd802856558b98760e588264f4c79036c5c78 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java @@ -60,7 +60,7 @@ public class JavaNGramExample { Dataset<Row> ngramDataFrame = ngramTransformer.transform(wordDataFrame); - for (Row r : ngramDataFrame.select("ngrams", "label").takeRows(3)) { + for (Row r : ngramDataFrame.select("ngrams", "label").takeAsList(3)) { java.util.List<String> ngrams = r.getList(0); for (String ngram : ngrams) System.out.print(ngram + " --- "); System.out.println(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java index 6ae418d564d1fecffe02bda1200bb6223a9ec05b..305420f208b79ad5e8e82c13d876bd8abbd07982 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java @@ -80,7 +80,7 @@ public class JavaPipelineExample { // Make predictions on test documents. Dataset<Row> predictions = model.transform(test); - for (Row r : predictions.select("id", "text", "probability", "prediction").collectRows()) { + for (Row r : predictions.select("id", "text", "probability", "prediction").collectAsList()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java index 5a4064c604301965523d0ba0b3bd142f3b9922e9..48fc3c8acb0c0d8be4d5abba0af53f52b90670be 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java @@ -23,6 +23,7 @@ import org.apache.spark.sql.SQLContext; // $example on$ import java.util.Arrays; +import java.util.List; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.PolynomialExpansion; @@ -61,8 +62,8 @@ public class JavaPolynomialExpansionExample { Dataset<Row> df = jsql.createDataFrame(data, schema); Dataset<Row> polyDF = polyExpansion.transform(df); - Row[] row = polyDF.select("polyFeatures").takeRows(3); - for (Row r : row) { + List<Row> rows = polyDF.select("polyFeatures").takeAsList(3); + for (Row r : rows) { System.out.println(r.get(0)); } // $example off$ diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java index 52bb4ec050376ef289a72fa73de20e087610194e..cb911ef5ef5869930553b21cfb6235de547260ae 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java @@ -103,7 +103,8 @@ public class JavaSimpleParamsExample { // Note that model2.transform() outputs a 'myProbability' column instead of the usual // 'probability' column since we renamed the lr.probabilityCol parameter previously. Dataset<Row> results = model2.transform(test); - for (Row r: results.select("features", "label", "myProbability", "prediction").collectRows()) { + Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction"); + for (Row r: rows.collectAsList()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java index 9bd543c44f98330cebaf6fee547617cd7ee89326..a18a60f4481666947433df54d4564c9bda7efe2a 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java @@ -84,7 +84,7 @@ public class JavaSimpleTextClassificationPipeline { // Make predictions on test documents. Dataset<Row> predictions = model.transform(test); - for (Row r: predictions.select("id", "text", "probability", "prediction").collectRows()) { + for (Row r: predictions.select("id", "text", "probability", "prediction").collectAsList()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java index fd1ce424bf8c441672566800053e1c191ab87211..37a3d0d84dae2c22a674d1be2925c5edb876f3d7 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java @@ -66,7 +66,7 @@ public class JavaTfIdfExample { IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features"); IDFModel idfModel = idf.fit(featurizedData); Dataset<Row> rescaledData = idfModel.transform(featurizedData); - for (Row r : rescaledData.select("features", "label").takeRows(3)) { + for (Row r : rescaledData.select("features", "label").takeAsList(3)) { Vector features = r.getAs(0); Double label = r.getDouble(1); System.out.println(features); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java index a2f8c436e32f6f44f4ec76670d52927a659dd1d9..9225fe2262f57105f5067249b8b0319bef1cdd2a 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java @@ -59,7 +59,7 @@ public class JavaTokenizerExample { Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); Dataset<Row> wordsDataFrame = tokenizer.transform(sentenceDataFrame); - for (Row r : wordsDataFrame.select("words", "label").takeRows(3)) { + for (Row r : wordsDataFrame.select("words", "label").takeAsList(3)) { java.util.List<String> words = r.getList(0); for (String word : words) System.out.print(word + " "); System.out.println(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java index 2dce8c2168c2df5a9beb1cca04e398c9afddb83e..c5bb1eaaa34461055bc8d0dc1745d56249110b14 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java @@ -59,7 +59,7 @@ public class JavaWord2VecExample { .setMinCount(0); Word2VecModel model = word2Vec.fit(documentDF); Dataset<Row> result = model.transform(documentDF); - for (Row r : result.select("result").takeRows(3)) { + for (Row r : result.select("result").takeAsList(3)) { System.out.println(r); } // $example off$ diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java index b87605ebfd6a3d29ae40c4952af1b4aacf03fcc6..e2da11183b93f1ccf30fc3826dd7d63493cdb817 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java @@ -78,7 +78,7 @@ public class JavaVectorSlicerSuite { Dataset<Row> output = vectorSlicer.transform(dataset); - for (Row r : output.select("userFeatures", "features").takeRows(2)) { + for (Row r : output.select("userFeatures", "features").takeAsList(2)) { Vector features = r.getAs(1); Assert.assertEquals(features.size(), 2); } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index f1791e6943bb72c3205746de2da5edcc7b9318d7..1ea7db0388689b4f5c4fd08b307599234ec54ef4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -1762,10 +1762,6 @@ class Dataset[T] private[sql]( */ def take(n: Int): Array[T] = head(n) - def takeRows(n: Int): Array[Row] = withTypedCallback("takeRows", limit(n)) { ds => - ds.collectRows(needCallback = false) - } - /** * Returns the first `n` rows in the [[DataFrame]] as a list. * @@ -1790,8 +1786,6 @@ class Dataset[T] private[sql]( */ def collect(): Array[T] = collect(needCallback = true) - def collectRows(): Array[Row] = collectRows(needCallback = true) - /** * Returns a Java list that contains all of [[Row]]s in this [[DataFrame]]. * @@ -1820,18 +1814,6 @@ class Dataset[T] private[sql]( } } - private def collectRows(needCallback: Boolean): Array[Row] = { - def execute(): Array[Row] = withNewExecutionId { - queryExecution.executedPlan.executeCollectPublic() - } - - if (needCallback) { - withCallback("collect", toDF())(_ => execute()) - } else { - execute() - } - } - /** * Returns the number of rows in the [[DataFrame]]. * @group action diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java index 42af813bc1cd3256f82864516c83acd7f74fd83e..ae9c8cc1ba9ff574eae99d822b2881be0ac1e6f6 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java @@ -109,13 +109,13 @@ public class JavaApplySchemaSuite implements Serializable { Dataset<Row> df = sqlContext.createDataFrame(rowRDD, schema); df.registerTempTable("people"); - Row[] actual = sqlContext.sql("SELECT * FROM people").collectRows(); + List<Row> actual = sqlContext.sql("SELECT * FROM people").collectAsList(); List<Row> expected = new ArrayList<>(2); expected.add(RowFactory.create("Michael", 29)); expected.add(RowFactory.create("Yin", 28)); - Assert.assertEquals(expected, Arrays.asList(actual)); + Assert.assertEquals(expected, actual); } @Test diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index 47cc74dbc1f28216b8746e84a7c375f501c85aa9..42554720edae5195143b4c55271499e6cf149bd7 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -19,6 +19,7 @@ package test.org.apache.spark.sql; import java.io.Serializable; import java.util.Arrays; +import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Map; @@ -65,7 +66,7 @@ public class JavaDataFrameSuite { @Test public void testExecution() { Dataset<Row> df = context.table("testData").filter("key = 1"); - Assert.assertEquals(1, df.select("key").collectRows()[0].get(0)); + Assert.assertEquals(1, df.select("key").collectAsList().get(0).get(0)); } @Test @@ -208,8 +209,8 @@ public class JavaDataFrameSuite { StructType schema = createStructType(Arrays.asList(createStructField("i", IntegerType, true))); List<Row> rows = Arrays.asList(RowFactory.create(0)); Dataset<Row> df = context.createDataFrame(rows, schema); - Row[] result = df.collectRows(); - Assert.assertEquals(1, result.length); + List<Row> result = df.collectAsList(); + Assert.assertEquals(1, result.size()); } @Test @@ -241,8 +242,8 @@ public class JavaDataFrameSuite { Assert.assertEquals("a_b", columnNames[0]); Assert.assertEquals("2", columnNames[1]); Assert.assertEquals("1", columnNames[2]); - Row[] rows = crosstab.collectRows(); - Arrays.sort(rows, crosstabRowComparator); + List<Row> rows = crosstab.collectAsList(); + Collections.sort(rows, crosstabRowComparator); Integer count = 1; for (Row row : rows) { Assert.assertEquals(row.get(0).toString(), count.toString()); @@ -257,7 +258,7 @@ public class JavaDataFrameSuite { Dataset<Row> df = context.table("testData2"); String[] cols = {"a"}; Dataset<Row> results = df.stat().freqItems(cols, 0.2); - Assert.assertTrue(results.collectRows()[0].getSeq(0).contains(1)); + Assert.assertTrue(results.collectAsList().get(0).getSeq(0).contains(1)); } @Test @@ -278,27 +279,27 @@ public class JavaDataFrameSuite { public void testSampleBy() { Dataset<Row> df = context.range(0, 100, 1, 2).select(col("id").mod(3).as("key")); Dataset<Row> sampled = df.stat().<Integer>sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L); - Row[] actual = sampled.groupBy("key").count().orderBy("key").collectRows(); - Assert.assertEquals(0, actual[0].getLong(0)); - Assert.assertTrue(0 <= actual[0].getLong(1) && actual[0].getLong(1) <= 8); - Assert.assertEquals(1, actual[1].getLong(0)); - Assert.assertTrue(2 <= actual[1].getLong(1) && actual[1].getLong(1) <= 13); + List<Row> actual = sampled.groupBy("key").count().orderBy("key").collectAsList(); + Assert.assertEquals(0, actual.get(0).getLong(0)); + Assert.assertTrue(0 <= actual.get(0).getLong(1) && actual.get(0).getLong(1) <= 8); + Assert.assertEquals(1, actual.get(1).getLong(0)); + Assert.assertTrue(2 <= actual.get(1).getLong(1) && actual.get(1).getLong(1) <= 13); } @Test public void pivot() { Dataset<Row> df = context.table("courseSales"); - Row[] actual = df.groupBy("year") + List<Row> actual = df.groupBy("year") .pivot("course", Arrays.<Object>asList("dotNET", "Java")) - .agg(sum("earnings")).orderBy("year").collectRows(); + .agg(sum("earnings")).orderBy("year").collectAsList(); - Assert.assertEquals(2012, actual[0].getInt(0)); - Assert.assertEquals(15000.0, actual[0].getDouble(1), 0.01); - Assert.assertEquals(20000.0, actual[0].getDouble(2), 0.01); + Assert.assertEquals(2012, actual.get(0).getInt(0)); + Assert.assertEquals(15000.0, actual.get(0).getDouble(1), 0.01); + Assert.assertEquals(20000.0, actual.get(0).getDouble(2), 0.01); - Assert.assertEquals(2013, actual[1].getInt(0)); - Assert.assertEquals(48000.0, actual[1].getDouble(1), 0.01); - Assert.assertEquals(30000.0, actual[1].getDouble(2), 0.01); + Assert.assertEquals(2013, actual.get(1).getInt(0)); + Assert.assertEquals(48000.0, actual.get(1).getDouble(1), 0.01); + Assert.assertEquals(30000.0, actual.get(1).getDouble(2), 0.01); } @Test