From 7a3e5dc28b67ac1630c5a578a27a5a5acf80aa51 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki <ishizaki@jp.ibm.com> Date: Thu, 15 Jun 2017 23:06:58 -0700 Subject: [PATCH] [SPARK-20749][SQL] Built-in SQL Function Support - all variants of LEN[GTH] ## What changes were proposed in this pull request? This PR adds the built-in SQL functions `BIT_LENGTH()`, `CHAR_LENGTH()`, and `OCTET_LENGTH()`. `BIT_LENGTH()` returns the bit length of the given string or binary expression. `CHAR_LENGTH()` returns the character length of the given string expression or the number of bytes of the given binary expression (i.e. it is equivalent to `LENGTH()`). `OCTET_LENGTH()` returns the byte length of the given string or binary expression. ## How was this patch tested? Added new test cases for these three functions. Author: Kazuaki Ishizaki <ishizaki@jp.ibm.com> Closes #18046 from kiszk/SPARK-20749. --- .../catalyst/analysis/FunctionRegistry.scala | 3 + .../expressions/stringExpressions.scala | 61 ++++++++++++++++++- .../expressions/StringExpressionsSuite.scala | 20 ++++++ .../resources/sql-tests/inputs/operators.sql | 5 ++ .../sql-tests/results/operators.sql.out | 26 +++++++- 5 files changed, 112 insertions(+), 3 deletions(-) mode change 100644 => 100755 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 877328164a..e4e9918a3a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -305,6 +305,8 @@ object FunctionRegistry { expression[Chr]("char"), expression[Chr]("chr"), expression[Base64]("base64"), + expression[BitLength]("bit_length"), + expression[Length]("char_length"), expression[Concat]("concat"), expression[ConcatWs]("concat_ws"), 
expression[Decode]("decode"), @@ -321,6 +323,7 @@ object FunctionRegistry { expression[Levenshtein]("levenshtein"), expression[Like]("like"), expression[Lower]("lower"), + expression[OctetLength]("octet_length"), expression[StringLocate]("locate"), expression[StringLPad]("lpad"), expression[StringTrimLeft]("ltrim"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala old mode 100644 new mode 100755 index 717ada225a..908fdb8f7e --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1199,15 +1199,18 @@ case class Substring(str: Expression, pos: Expression, len: Expression) } /** - * A function that return the length of the given string or binary expression. + * A function that returns the char length of the given string expression or + * number of bytes of the given binary expression. */ +// scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the length of `expr` or number of bytes in binary data.", + usage = "_FUNC_(expr) - Returns the character length of `expr` or number of bytes in binary data.", extended = """ Examples: > SELECT _FUNC_('Spark SQL'); 9 """) +// scalastyle:on line.size.limit case class Length(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { override def dataType: DataType = IntegerType override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType)) @@ -1225,6 +1228,60 @@ case class Length(child: Expression) extends UnaryExpression with ImplicitCastIn } } +/** + * A function that returns the bit length of the given string or binary expression. 
+ */ +@ExpressionDescription( + usage = "_FUNC_(expr) - Returns the bit length of `expr` or number of bits in binary data.", + extended = """ + Examples: + > SELECT _FUNC_('Spark SQL'); + 72 + """) +case class BitLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { + override def dataType: DataType = IntegerType + override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType)) + + protected override def nullSafeEval(value: Any): Any = child.dataType match { + case StringType => value.asInstanceOf[UTF8String].numBytes * 8 + case BinaryType => value.asInstanceOf[Array[Byte]].length * 8 + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + child.dataType match { + case StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes() * 8") + case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length * 8") + } + } +} + +/** + * A function that returns the byte length of the given string or binary expression. + */ +@ExpressionDescription( + usage = "_FUNC_(expr) - Returns the byte length of `expr` or number of bytes in binary data.", + extended = """ + Examples: + > SELECT _FUNC_('Spark SQL'); + 9 + """) +case class OctetLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { + override def dataType: DataType = IntegerType + override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType)) + + protected override def nullSafeEval(value: Any): Any = child.dataType match { + case StringType => value.asInstanceOf[UTF8String].numBytes + case BinaryType => value.asInstanceOf[Array[Byte]].length + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + child.dataType match { + case StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes()") + case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length") + } + } +} + /** * A function that return the Levenshtein distance between the two given strings. 
*/ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 4bdb43bfed..4f08031153 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -558,20 +558,40 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // scalastyle:off // non ascii characters are not allowed in the source code, so we disable the scalastyle. checkEvaluation(Length(Literal("a花花c")), 4, create_row(string)) + checkEvaluation(OctetLength(Literal("a花花c")), 8, create_row(string)) + checkEvaluation(BitLength(Literal("a花花c")), 8 * 8, create_row(string)) // scalastyle:on checkEvaluation(Length(Literal(bytes)), 5, create_row(Array.empty[Byte])) + checkEvaluation(OctetLength(Literal(bytes)), 5, create_row(Array.empty[Byte])) + checkEvaluation(BitLength(Literal(bytes)), 5 * 8, create_row(Array.empty[Byte])) checkEvaluation(Length(a), 5, create_row(string)) + checkEvaluation(OctetLength(a), 5, create_row(string)) + checkEvaluation(BitLength(a), 5 * 8, create_row(string)) checkEvaluation(Length(b), 5, create_row(bytes)) + checkEvaluation(OctetLength(b), 5, create_row(bytes)) + checkEvaluation(BitLength(b), 5 * 8, create_row(bytes)) checkEvaluation(Length(a), 0, create_row("")) + checkEvaluation(OctetLength(a), 0, create_row("")) + checkEvaluation(BitLength(a), 0, create_row("")) checkEvaluation(Length(b), 0, create_row(Array.empty[Byte])) + checkEvaluation(OctetLength(b), 0, create_row(Array.empty[Byte])) + checkEvaluation(BitLength(b), 0, create_row(Array.empty[Byte])) checkEvaluation(Length(a), null, create_row(null)) + checkEvaluation(OctetLength(a), null, create_row(null)) + checkEvaluation(BitLength(a), null, create_row(null)) checkEvaluation(Length(b), 
null, create_row(null)) + checkEvaluation(OctetLength(b), null, create_row(null)) + checkEvaluation(BitLength(b), null, create_row(null)) checkEvaluation(Length(Literal.create(null, StringType)), null, create_row(string)) + checkEvaluation(OctetLength(Literal.create(null, StringType)), null, create_row(string)) + checkEvaluation(BitLength(Literal.create(null, StringType)), null, create_row(string)) checkEvaluation(Length(Literal.create(null, BinaryType)), null, create_row(bytes)) + checkEvaluation(OctetLength(Literal.create(null, BinaryType)), null, create_row(bytes)) + checkEvaluation(BitLength(Literal.create(null, BinaryType)), null, create_row(bytes)) } test("format_number / FormatNumber") { diff --git a/sql/core/src/test/resources/sql-tests/inputs/operators.sql b/sql/core/src/test/resources/sql-tests/inputs/operators.sql index 3934620577..a8de23e738 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/operators.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/operators.sql @@ -80,3 +80,8 @@ select 1 > 0.00001; -- mod select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, null); + +-- length +select BIT_LENGTH('abc'); +select CHAR_LENGTH('abc'); +select OCTET_LENGTH('abc'); diff --git a/sql/core/src/test/resources/sql-tests/results/operators.sql.out b/sql/core/src/test/resources/sql-tests/results/operators.sql.out index 51ccf764d9..85ee10b4d2 100644 --- a/sql/core/src/test/resources/sql-tests/results/operators.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/operators.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 51 +-- Number of queries: 54 -- !query 0 @@ -420,3 +420,27 @@ select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, nu struct<(7 % 2):int,(7 % 0):int,(0 % 2):int,(7 % CAST(NULL AS INT)):int,(CAST(NULL AS INT) % 2):int,(CAST(NULL AS DOUBLE) % CAST(NULL AS DOUBLE)):double> -- !query 50 output 1 NULL 0 NULL NULL NULL + + +-- !query 51 
+select BIT_LENGTH('abc') +-- !query 51 schema +struct<bitlength(abc):int> +-- !query 51 output +24 + + +-- !query 52 +select CHAR_LENGTH('abc') +-- !query 52 schema +struct<length(abc):int> +-- !query 52 output +3 + + +-- !query 53 +select OCTET_LENGTH('abc') +-- !query 53 schema +struct<octetlength(abc):int> +-- !query 53 output +3 -- GitLab