Commit e7082cae authored by Cheng Lian

[SPARK-15550][SQL] Dataset.show() should show contents of nested products as rows

## What changes were proposed in this pull request?

This PR addresses two related issues:

1. `Dataset.showString()` should show case classes/Java beans at all nesting levels as rows, while the current master code only handles top-level ones.

2. `Dataset.showString()` should show the full contents produced by the underlying query plan.

   A Dataset is only a view over the underlying query plan. Columns not referred to by the encoder are still reachable through methods like `Dataset.col`, so it makes more sense to show the full contents of the query plan (see the sketch after this list).
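
As a rough illustration of both points, here is a hypothetical spark-shell sketch. The case classes mirror the `ClassData`/`NestedStruct` helpers used in `DatasetSuite`; the session itself is not part of the patch:

```scala
case class ClassData(a: String, b: Int)
case class NestedStruct(f: ClassData)

import spark.implicits._

// (1) Nested products are rendered as bracketed rows rather than via
// their default toString:
Seq(NestedStruct(ClassData("foo", 1)), NestedStruct(ClassData("bar", 2))).toDS().show()
// +-------+
// |      f|
// +-------+
// |[foo,1]|
// |[bar,2]|
// +-------+

// (2) show() prints every column produced by the underlying plan, even ones
// the encoder never reads -- column `c` survives the .as[ClassData] view:
val ds = Seq((1, "foo", "extra"), (2, "bar", "extra")).toDF("b", "a", "c").as[ClassData]
ds.show()
// +---+---+-----+
// |  b|  a|    c|
// +---+---+-----+
// |  1|foo|extra|
// |  2|bar|extra|
// +---+---+-----+
```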

## How was this patch tested?

Two new test cases are added in `DatasetSuite` to check `.showString()` output.
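
The test helper derives the number of rows to fetch from the expected output itself: every table rendered by `showString` has exactly four frame lines (top border, header, separator, bottom border), so subtracting 4 from the line count recovers the data row count. A minimal sketch of that arithmetic, using the expected string from one of the new tests:

```scala
// Sketch of the row-count arithmetic in the new checkShowString helper:
// frame lines = top border + header + separator + bottom border = 4.
val expected =
  """+-------+
    ||      f|
    |+-------+
    ||[foo,1]|
    ||[bar,2]|
    |+-------+
    |""".stripMargin

// String#split drops the trailing empty segment, leaving 6 lines; 6 - 4 = 2.
val numRows = expected.split("\n").length - 4
assert(numRows == 2)
```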

Author: Cheng Lian <lian@databricks.com>

Closes #13331 from liancheng/spark-15550-ds-show.
parent fe6de16f
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -237,19 +237,13 @@ class Dataset[T] private[sql](
    */
   private[sql] def showString(_numRows: Int, truncate: Boolean = true): String = {
     val numRows = _numRows.max(0)
-    val takeResult = take(numRows + 1)
+    val takeResult = toDF().take(numRows + 1)
     val hasMoreData = takeResult.length > numRows
     val data = takeResult.take(numRows)
 
     // For array values, replace Seq and Array with square brackets
     // For cells that are beyond 20 characters, replace it with the first 17 and "..."
-    val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map {
-      case r: Row => r
-      case tuple: Product => Row.fromTuple(tuple)
-      case definedByCtor: DefinedByConstructorParams =>
-        Row.fromSeq(ScalaReflection.getConstructorParameterValues(definedByCtor))
-      case o => Row(o)
-    }.map { row =>
+    val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row =>
       row.toSeq.map { cell =>
         val str = cell match {
           case null => "null"
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -22,9 +22,8 @@ import java.sql.{Date, Timestamp}
 
 import scala.language.postfixOps
 
-import org.scalatest.words.MatcherWords.be
-
 import org.apache.spark.sql.catalyst.encoders.{OuterScopes, RowEncoder}
+import org.apache.spark.sql.catalyst.util.sideBySide
 import org.apache.spark.sql.execution.streaming.MemoryStream
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.test.SharedSQLContext
@@ -217,7 +216,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS()
     checkDataset(
       ds.filter(_._1 == "b").select(expr("_1").as[String]),
-      ("b"))
+      "b")
   }
 
   test("foreach") {
@@ -436,20 +435,6 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     assert(ds.toString == "[_1: int, _2: int]")
   }
 
-  test("showString: Kryo encoder") {
-    implicit val kryoEncoder = Encoders.kryo[KryoData]
-    val ds = Seq(KryoData(1), KryoData(2)).toDS()
-
-    val expectedAnswer = """+-----------+
-                           ||      value|
-                           |+-----------+
-                           ||KryoData(1)|
-                           ||KryoData(2)|
-                           |+-----------+
-                           |""".stripMargin
-    assert(ds.showString(10) === expectedAnswer)
-  }
-
   test("Kryo encoder") {
     implicit val kryoEncoder = Encoders.kryo[KryoData]
     val ds = Seq(KryoData(1), KryoData(2)).toDS()
@@ -677,7 +662,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
   }
 
   test("dataset.rdd with generic case class") {
-    val ds = Seq(Generic(1, 1.0), Generic(2, 2.0)).toDS
+    val ds = Seq(Generic(1, 1.0), Generic(2, 2.0)).toDS()
     val ds2 = ds.map(g => Generic(g.id, g.value))
     assert(ds.rdd.map(r => r.id).count === 2)
     assert(ds2.rdd.map(r => r.id).count === 2)
@@ -731,6 +716,53 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     val df = Seq(1 -> 2).toDF("a", "b")
     checkAnswer(df.map(row => row)(RowEncoder(df.schema)).select("b", "a"), Row(2, 1))
   }
+
+  private def checkShowString[T](ds: Dataset[T], expected: String): Unit = {
+    val numRows = expected.split("\n").length - 4
+    val actual = ds.showString(numRows, truncate = true)
+
+    if (expected != actual) {
+      fail(
+        "Dataset.showString() gives wrong result:\n\n" + sideBySide(
+          "== Expected ==\n" + expected,
+          "== Actual ==\n" + actual
+        ).mkString("\n")
+      )
+    }
+  }
+
+  test("SPARK-15550 Dataset.show() should show contents of the underlying logical plan") {
+    val df = Seq((1, "foo", "extra"), (2, "bar", "extra")).toDF("b", "a", "c")
+    val ds = df.as[ClassData]
+    val expected =
+      """+---+---+-----+
+        ||  b|  a|    c|
+        |+---+---+-----+
+        ||  1|foo|extra|
+        ||  2|bar|extra|
+        |+---+---+-----+
+        |""".stripMargin
+
+    checkShowString(ds, expected)
+  }
+
+  test("SPARK-15550 Dataset.show() should show inner nested products as rows") {
+    val ds = Seq(
+      NestedStruct(ClassData("foo", 1)),
+      NestedStruct(ClassData("bar", 2))
+    ).toDS()
+
+    val expected =
+      """+-------+
+        ||      f|
+        |+-------+
+        ||[foo,1]|
+        ||[bar,2]|
+        |+-------+
+        |""".stripMargin
+
+    checkShowString(ds, expected)
+  }
 }
 
 case class Generic[T](id: T, value: Double)