diff --git a/.gitignore b/.gitignore index 9f8cd0b4cb232d38568ed6e7ef5820d5f578f4df..b4dd1d05a5f6fb62302f345c629bd6732b38e526 100644 --- a/.gitignore +++ b/.gitignore @@ -77,3 +77,4 @@ spark-warehouse/ # For R session data .RData .RHistory +.Rhistory diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index f856979c2a814d688b2657bebf106c040454900e..61d47a8c2d9b8607bc33ba0f63bab1f3f94dbd1a 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -176,8 +176,8 @@ setMethod("isLocal", #' @param x A SparkDataFrame #' @param numRows The number of rows to print. Defaults to 20. #' @param truncate Whether truncate long strings. If true, strings more than 20 characters will be -#' truncated and all cells will be aligned right -#' +#' truncated. If set to a number greater than zero, strings longer than `truncate` +#' characters will be truncated instead, and all cells will be aligned right. #' @family SparkDataFrame functions #' @rdname showDF #' @name showDF @@ -193,7 +193,12 @@ setMethod("isLocal", setMethod("showDF", signature(x = "SparkDataFrame"), function(x, numRows = 20, truncate = TRUE) { - s <- callJMethod(x@sdf, "showString", numToInt(numRows), truncate) + if (is.logical(truncate) && truncate) { + s <- callJMethod(x@sdf, "showString", numToInt(numRows), numToInt(20)) + } else { + truncate2 <- as.numeric(truncate) + s <- callJMethod(x@sdf, "showString", numToInt(numRows), numToInt(truncate2)) + } cat(s) }) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 74def5ce4245dab7af1148e37bc9bd08ee64e24c..7562fa95e3bded4c55b024438f84224e318bb1fb 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1582,7 +1582,15 @@ test_that("showDF()", { "| 30| Andy|\n", "| 19| Justin|\n", "+----+-------+\n", sep = "") + expected2 <- paste("+---+----+\n", + "|age|name|\n", + "+---+----+\n", + "|nul| Mic|\n", + "| 30| And|\n", + "| 19| Jus|\n", + "+---+----+\n", sep = "") 
expect_output(showDF(df), expected) + expect_output(showDF(df, truncate = 3), expected2) }) test_that("isLocal()", { diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index acf9d08b23a2742841f7a14bc1a892918a535967..a2443ed3d60b6eb485a6a816a39f85f1a19c95b2 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -271,7 +271,9 @@ class DataFrame(object): """Prints the first ``n`` rows to the console. :param n: Number of rows to show. - :param truncate: Whether truncate long strings and align cells right. + :param truncate: If set to True, truncate strings longer than 20 chars by default. + If set to a number greater than zero, truncates long strings to length ``truncate`` + and aligns cells right. >>> df DataFrame[age: int, name: string] @@ -282,8 +284,18 @@ class DataFrame(object): | 2|Alice| | 5| Bob| +---+-----+ - """ - print(self._jdf.showString(n, truncate)) + >>> df.show(truncate=3) + +---+----+ + |age|name| + +---+----+ + | 2| Ali| + | 5| Bob| + +---+----+ + """ + if isinstance(truncate, bool) and truncate: + print(self._jdf.showString(n, 20)) + else: + print(self._jdf.showString(n, int(truncate))) def __repr__(self): return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 85d060639c7ff27b959321644d09c589c21ff121..9997162f7cfcfa02c35ead769cc125a2db194ca0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -232,16 +232,18 @@ class Dataset[T] private[sql]( * Compose the string representing rows for output * * @param _numRows Number of rows to show - * @param truncate Whether truncate long strings and align cells right + * @param truncate If set to more than 0, truncates strings to `truncate` characters and + * all cells will be aligned right. 
*/ - private[sql] def showString(_numRows: Int, truncate: Boolean = true): String = { + private[sql] def showString(_numRows: Int, truncate: Int = 20): String = { val numRows = _numRows.max(0) val takeResult = toDF().take(numRows + 1) val hasMoreData = takeResult.length > numRows val data = takeResult.take(numRows) // For array values, replace Seq and Array with square brackets - // For cells that are beyond 20 characters, replace it with the first 17 and "..." + // For cells that are beyond `truncate` characters, replace it with the + // first `truncate-3` and "..." val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row => row.toSeq.map { cell => val str = cell match { @@ -251,7 +253,13 @@ class Dataset[T] private[sql]( case seq: Seq[_] => seq.mkString("[", ", ", "]") case _ => cell.toString } - if (truncate && str.length > 20) str.substring(0, 17) + "..." else str + if (truncate > 0 && str.length > truncate) { + // do not show ellipses for strings shorter than 4 characters. + if (truncate < 4) str.substring(0, truncate) + else str.substring(0, truncate - 3) + "..." 
+ } else { + str + } }: Seq[String] } @@ -273,7 +281,7 @@ class Dataset[T] private[sql]( // column names rows.head.zipWithIndex.map { case (cell, i) => - if (truncate) { + if (truncate > 0) { StringUtils.leftPad(cell, colWidths(i)) } else { StringUtils.rightPad(cell, colWidths(i)) @@ -285,7 +293,7 @@ class Dataset[T] private[sql]( // data rows.tail.map { _.zipWithIndex.map { case (cell, i) => - if (truncate) { + if (truncate > 0) { StringUtils.leftPad(cell.toString, colWidths(i)) } else { StringUtils.rightPad(cell.toString, colWidths(i)) @@ -523,7 +531,32 @@ class Dataset[T] private[sql]( * @since 1.6.0 */ // scalastyle:off println - def show(numRows: Int, truncate: Boolean): Unit = println(showString(numRows, truncate)) + def show(numRows: Int, truncate: Boolean): Unit = if (truncate) { + println(showString(numRows, truncate = 20)) + } else { + println(showString(numRows, truncate = 0)) + } + // scalastyle:on println + + /** + * Displays the Dataset in a tabular form. For example: + * {{{ + * year month AVG('Adj Close) MAX('Adj Close) + * 1980 12 0.503218 0.595103 + * 1981 01 0.523289 0.570307 + * 1982 02 0.436504 0.475256 + * 1983 03 0.410516 0.442194 + * 1984 04 0.450090 0.483521 + * }}} + * + * @param numRows Number of rows to show + * @param truncate If set to more than 0, truncates strings to `truncate` characters and + * all cells will be aligned right. 
+ * @group action + * @since 1.6.0 + */ + // scalastyle:off println + def show(numRows: Int, truncate: Int): Unit = println(showString(numRows, truncate)) // scalastyle:on println /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 1afee9f02104ed96cb6d3ab332a5341d76f42c77..6a0a7df3f4a59cbede3cccac229993fac0529613 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -723,7 +723,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { testData.select($"*").show(1000) } - test("showString: truncate = [true, false]") { + test("showString: truncate = [0, 20]") { val longString = Array.fill(21)("1").mkString val df = sparkContext.parallelize(Seq("1", longString)).toDF() val expectedAnswerForFalse = """+---------------------+ @@ -733,7 +733,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { ||111111111111111111111| |+---------------------+ |""".stripMargin - assert(df.showString(10, false) === expectedAnswerForFalse) + assert(df.showString(10, truncate = 0) === expectedAnswerForFalse) val expectedAnswerForTrue = """+--------------------+ || value| |+--------------------+ @@ -741,7 +741,28 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { ||11111111111111111...| |+--------------------+ |""".stripMargin - assert(df.showString(10, true) === expectedAnswerForTrue) + assert(df.showString(10, truncate = 20) === expectedAnswerForTrue) + } + + test("showString: truncate = [3, 17]") { + val longString = Array.fill(21)("1").mkString + val df = sparkContext.parallelize(Seq("1", longString)).toDF() + val expectedAnswerForFalse = """+-----+ + ||value| + |+-----+ + || 1| + || 111| + |+-----+ + |""".stripMargin + assert(df.showString(10, truncate = 3) === expectedAnswerForFalse) + val expectedAnswerForTrue = """+-----------------+ + || 
value| + |+-----------------+ + || 1| + ||11111111111111...| + |+-----------------+ + |""".stripMargin + assert(df.showString(10, truncate = 17) === expectedAnswerForTrue) } test("showString(negative)") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 43cbc03b7aa0c44244ff2e8a6d536592128c3686..0b6f40872f2e56783109655d012b16298bf9e045 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -730,7 +730,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext { private def checkShowString[T](ds: Dataset[T], expected: String): Unit = { val numRows = expected.split("\n").length - 4 - val actual = ds.showString(numRows, truncate = true) + val actual = ds.showString(numRows, truncate = 20) if (expected != actual) { fail(