From 7a3e5dc28b67ac1630c5a578a27a5a5acf80aa51 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Thu, 15 Jun 2017 23:06:58 -0700
Subject: [PATCH] [SPARK-20749][SQL] Built-in SQL Function Support - all
 variants of LEN[GTH]

## What changes were proposed in this pull request?

This PR adds the built-in SQL functions `BIT_LENGTH()`, `CHAR_LENGTH()`, and `OCTET_LENGTH()`.

`BIT_LENGTH()` returns the bit length of the given string or binary expression.
`CHAR_LENGTH()` returns the character length of the given string expression, or the number of bytes of the given binary expression (i.e., it is an alias of `LENGTH()`).
`OCTET_LENGTH()` returns the byte length of the given string or binary expression.

## How was this patch tested?

Added new test cases for these three functions to `StringExpressionsSuite` and `operators.sql`.

Author: Kazuaki Ishizaki <ishizaki@jp.ibm.com>

Closes #18046 from kiszk/SPARK-20749.
---
 .../catalyst/analysis/FunctionRegistry.scala  |  3 +
 .../expressions/stringExpressions.scala       | 61 ++++++++++++++++++-
 .../expressions/StringExpressionsSuite.scala  | 20 ++++++
 .../resources/sql-tests/inputs/operators.sql  |  5 ++
 .../sql-tests/results/operators.sql.out       | 26 +++++++-
 5 files changed, 112 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 877328164a..e4e9918a3a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -305,6 +305,8 @@ object FunctionRegistry {
     expression[Chr]("char"),
     expression[Chr]("chr"),
     expression[Base64]("base64"),
+    expression[BitLength]("bit_length"),
+    expression[Length]("char_length"),
     expression[Concat]("concat"),
     expression[ConcatWs]("concat_ws"),
     expression[Decode]("decode"),
@@ -321,6 +323,7 @@ object FunctionRegistry {
     expression[Levenshtein]("levenshtein"),
     expression[Like]("like"),
     expression[Lower]("lower"),
+    expression[OctetLength]("octet_length"),
     expression[StringLocate]("locate"),
     expression[StringLPad]("lpad"),
     expression[StringTrimLeft]("ltrim"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
old mode 100644
new mode 100755
index 717ada225a..908fdb8f7e
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -1199,15 +1199,18 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
 }
 
 /**
- * A function that return the length of the given string or binary expression.
+ * A function that returns the char length of the given string expression or
+ * number of bytes of the given binary expression.
  */
+// scalastyle:off line.size.limit
 @ExpressionDescription(
-  usage = "_FUNC_(expr) - Returns the length of `expr` or number of bytes in binary data.",
+  usage = "_FUNC_(expr) - Returns the character length of `expr` or number of bytes in binary data.",
   extended = """
     Examples:
       > SELECT _FUNC_('Spark SQL');
        9
   """)
+// scalastyle:on line.size.limit
 case class Length(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
   override def dataType: DataType = IntegerType
   override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))
@@ -1225,6 +1228,60 @@ case class Length(child: Expression) extends UnaryExpression with ImplicitCastIn
   }
 }
 
+/**
+ * Returns the bit length, i.e. the number of bytes times 8, of a string or binary expression.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(expr) - Returns the bit length of `expr` or number of bits in binary data.",
+  extended = """
+    Examples:
+      > SELECT _FUNC_('Spark SQL');
+       72
+  """)
+case class BitLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+  override def dataType: DataType = IntegerType // NOTE(review): bytes * 8 may overflow Int
+  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))
+
+  protected override def nullSafeEval(value: Any): Any = child.dataType match {
+    case StringType => value.asInstanceOf[UTF8String].numBytes * 8 // bytes, not characters
+    case BinaryType => value.asInstanceOf[Array[Byte]].length * 8
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    child.dataType match { // generated snippets mirror the nullSafeEval branches
+      case StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes() * 8")
+      case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length * 8")
+    }
+  }
+}
+
+/**
+ * Returns the number of bytes (octets) in the given string or binary expression.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(expr) - Returns the byte length of `expr` or number of bytes in binary data.",
+  extended = """
+    Examples:
+      > SELECT _FUNC_('Spark SQL');
+       9
+  """)
+case class OctetLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+  override def dataType: DataType = IntegerType
+  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))
+
+  protected override def nullSafeEval(value: Any): Any = child.dataType match {
+    case StringType => value.asInstanceOf[UTF8String].numBytes // bytes, not characters
+    case BinaryType => value.asInstanceOf[Array[Byte]].length
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    child.dataType match { // generated snippets mirror the nullSafeEval branches
+      case StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes()")
+      case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length")
+    }
+  }
+}
+
 /**
  * A function that return the Levenshtein distance between the two given strings.
  */
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 4bdb43bfed..4f08031153 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -558,20 +558,40 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     // scalastyle:off
     // non ascii characters are not allowed in the source code, so we disable the scalastyle.
     checkEvaluation(Length(Literal("a花花c")), 4, create_row(string))
+    checkEvaluation(OctetLength(Literal("a花花c")), 8, create_row(string))
+    checkEvaluation(BitLength(Literal("a花花c")), 8 * 8, create_row(string))
     // scalastyle:on
     checkEvaluation(Length(Literal(bytes)), 5, create_row(Array.empty[Byte]))
+    checkEvaluation(OctetLength(Literal(bytes)), 5, create_row(Array.empty[Byte]))
+    checkEvaluation(BitLength(Literal(bytes)), 5 * 8, create_row(Array.empty[Byte]))
 
     checkEvaluation(Length(a), 5, create_row(string))
+    checkEvaluation(OctetLength(a), 5, create_row(string))
+    checkEvaluation(BitLength(a), 5 * 8, create_row(string))
     checkEvaluation(Length(b), 5, create_row(bytes))
+    checkEvaluation(OctetLength(b), 5, create_row(bytes))
+    checkEvaluation(BitLength(b), 5 * 8, create_row(bytes))
 
     checkEvaluation(Length(a), 0, create_row(""))
+    checkEvaluation(OctetLength(a), 0, create_row(""))
+    checkEvaluation(BitLength(a), 0, create_row(""))
     checkEvaluation(Length(b), 0, create_row(Array.empty[Byte]))
+    checkEvaluation(OctetLength(b), 0, create_row(Array.empty[Byte]))
+    checkEvaluation(BitLength(b), 0, create_row(Array.empty[Byte]))
 
     checkEvaluation(Length(a), null, create_row(null))
+    checkEvaluation(OctetLength(a), null, create_row(null))
+    checkEvaluation(BitLength(a), null, create_row(null))
     checkEvaluation(Length(b), null, create_row(null))
+    checkEvaluation(OctetLength(b), null, create_row(null))
+    checkEvaluation(BitLength(b), null, create_row(null))
 
     checkEvaluation(Length(Literal.create(null, StringType)), null, create_row(string))
+    checkEvaluation(OctetLength(Literal.create(null, StringType)), null, create_row(string))
+    checkEvaluation(BitLength(Literal.create(null, StringType)), null, create_row(string))
     checkEvaluation(Length(Literal.create(null, BinaryType)), null, create_row(bytes))
+    checkEvaluation(OctetLength(Literal.create(null, BinaryType)), null, create_row(bytes))
+    checkEvaluation(BitLength(Literal.create(null, BinaryType)), null, create_row(bytes))
   }
 
   test("format_number / FormatNumber") {
diff --git a/sql/core/src/test/resources/sql-tests/inputs/operators.sql b/sql/core/src/test/resources/sql-tests/inputs/operators.sql
index 3934620577..a8de23e738 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/operators.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/operators.sql
@@ -80,3 +80,8 @@ select 1 > 0.00001;
 
 -- mod
 select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, null);
+
+-- length
+select BIT_LENGTH('abc');
+select CHAR_LENGTH('abc');
+select OCTET_LENGTH('abc');
diff --git a/sql/core/src/test/resources/sql-tests/results/operators.sql.out b/sql/core/src/test/resources/sql-tests/results/operators.sql.out
index 51ccf764d9..85ee10b4d2 100644
--- a/sql/core/src/test/resources/sql-tests/results/operators.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/operators.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 51
+-- Number of queries: 54
 
 
 -- !query 0
@@ -420,3 +420,27 @@ select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, nu
 struct<(7 % 2):int,(7 % 0):int,(0 % 2):int,(7 % CAST(NULL AS INT)):int,(CAST(NULL AS INT) % 2):int,(CAST(NULL AS DOUBLE) % CAST(NULL AS DOUBLE)):double>
 -- !query 50 output
 1	NULL	0	NULL	NULL	NULL
+
+
+-- !query 51
+select BIT_LENGTH('abc')
+-- !query 51 schema
+struct<bitlength(abc):int>
+-- !query 51 output
+24
+
+
+-- !query 52
+select CHAR_LENGTH('abc')
+-- !query 52 schema
+struct<length(abc):int>
+-- !query 52 output
+3
+
+
+-- !query 53
+select OCTET_LENGTH('abc')
+-- !query 53 schema
+struct<octetlength(abc):int>
+-- !query 53 output
+3
-- 
GitLab