Skip to content
Snippets Groups Projects
Commit 7a3e5dc2 authored by Kazuaki Ishizaki's avatar Kazuaki Ishizaki Committed by Xiao Li
Browse files

[SPARK-20749][SQL] Built-in SQL Function Support - all variants of LEN[GTH]

## What changes were proposed in this pull request?

This PR adds three built-in SQL functions: `BIT_LENGTH()`, `CHAR_LENGTH()`, and `OCTET_LENGTH()`.

`BIT_LENGTH()` returns the bit length of the given string or binary expression.
`CHAR_LENGTH()` returns the character length of the given string expression, or the number of bytes of the given binary expression (i.e. it behaves the same as `LENGTH()`).
`OCTET_LENGTH()` returns the byte length of the given string or binary expression.

## How was this patch tested?

Added new test suites for these three functions

Author: Kazuaki Ishizaki <ishizaki@jp.ibm.com>

Closes #18046 from kiszk/SPARK-20749.
parent 87ab0cec
No related branches found
No related tags found
No related merge requests found
...@@ -305,6 +305,8 @@ object FunctionRegistry { ...@@ -305,6 +305,8 @@ object FunctionRegistry {
expression[Chr]("char"), expression[Chr]("char"),
expression[Chr]("chr"), expression[Chr]("chr"),
expression[Base64]("base64"), expression[Base64]("base64"),
expression[BitLength]("bit_length"),
expression[Length]("char_length"),
expression[Concat]("concat"), expression[Concat]("concat"),
expression[ConcatWs]("concat_ws"), expression[ConcatWs]("concat_ws"),
expression[Decode]("decode"), expression[Decode]("decode"),
...@@ -321,6 +323,7 @@ object FunctionRegistry { ...@@ -321,6 +323,7 @@ object FunctionRegistry {
expression[Levenshtein]("levenshtein"), expression[Levenshtein]("levenshtein"),
expression[Like]("like"), expression[Like]("like"),
expression[Lower]("lower"), expression[Lower]("lower"),
expression[OctetLength]("octet_length"),
expression[StringLocate]("locate"), expression[StringLocate]("locate"),
expression[StringLPad]("lpad"), expression[StringLPad]("lpad"),
expression[StringTrimLeft]("ltrim"), expression[StringTrimLeft]("ltrim"),
......
...@@ -1199,15 +1199,18 @@ case class Substring(str: Expression, pos: Expression, len: Expression) ...@@ -1199,15 +1199,18 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
} }
/** /**
* A function that return the length of the given string or binary expression. * A function that returns the char length of the given string expression or
* number of bytes of the given binary expression.
*/ */
// scalastyle:off line.size.limit
@ExpressionDescription( @ExpressionDescription(
usage = "_FUNC_(expr) - Returns the length of `expr` or number of bytes in binary data.", usage = "_FUNC_(expr) - Returns the character length of `expr` or number of bytes in binary data.",
extended = """ extended = """
Examples: Examples:
> SELECT _FUNC_('Spark SQL'); > SELECT _FUNC_('Spark SQL');
9 9
""") """)
// scalastyle:on line.size.limit
case class Length(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { case class Length(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
override def dataType: DataType = IntegerType override def dataType: DataType = IntegerType
override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType)) override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))
...@@ -1225,6 +1228,60 @@ case class Length(child: Expression) extends UnaryExpression with ImplicitCastIn ...@@ -1225,6 +1228,60 @@ case class Length(child: Expression) extends UnaryExpression with ImplicitCastIn
} }
} }
/**
 * Expression that yields the size of its child in bits: eight times the value of
 * `UTF8String.numBytes` for a string child, or eight times the array length for a
 * binary child.
 */
@ExpressionDescription(
  usage = "_FUNC_(expr) - Returns the bit length of `expr` or number of bits in binary data.",
  extended = """
Examples:
> SELECT _FUNC_('Spark SQL');
72
""")
case class BitLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {

  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))

  override def dataType: DataType = IntegerType

  // Interpreted evaluation: find the byte count for the child's declared type, then
  // scale it to bits.
  protected override def nullSafeEval(value: Any): Any = {
    val byteCount = child.dataType match {
      case StringType => value.asInstanceOf[UTF8String].numBytes
      case BinaryType => value.asInstanceOf[Array[Byte]].length
    }
    byteCount * 8
  }

  // Code generation: choose the Java snippet builder matching the child's declared type
  // and hand it to defineCodeGen.
  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val javaExpr: String => String = child.dataType match {
      case StringType => input => s"($input).numBytes() * 8"
      case BinaryType => input => s"($input).length * 8"
    }
    defineCodeGen(ctx, ev, javaExpr)
  }
}
/**
 * Expression that yields the size of its child in bytes: the value of
 * `UTF8String.numBytes` for a string child, or the array length for a binary child.
 */
@ExpressionDescription(
  usage = "_FUNC_(expr) - Returns the byte length of `expr` or number of bytes in binary data.",
  extended = """
Examples:
> SELECT _FUNC_('Spark SQL');
9
""")
case class OctetLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {

  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))

  override def dataType: DataType = IntegerType

  // Interpreted evaluation: dispatch on the child's declared type to read the byte count.
  protected override def nullSafeEval(value: Any): Any = {
    child.dataType match {
      case StringType =>
        val text = value.asInstanceOf[UTF8String]
        text.numBytes
      case BinaryType =>
        val raw = value.asInstanceOf[Array[Byte]]
        raw.length
    }
  }

  // Code generation: select the Java expression for the child's declared type, then
  // emit it through a single defineCodeGen call.
  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val javaExpr: String => String = child.dataType match {
      case StringType => input => s"($input).numBytes()"
      case BinaryType => input => s"($input).length"
    }
    defineCodeGen(ctx, ev, javaExpr)
  }
}
/** /**
* A function that return the Levenshtein distance between the two given strings. * A function that return the Levenshtein distance between the two given strings.
*/ */
......
...@@ -558,20 +558,40 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { ...@@ -558,20 +558,40 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
// scalastyle:off // scalastyle:off
// non ascii characters are not allowed in the source code, so we disable the scalastyle. // non ascii characters are not allowed in the source code, so we disable the scalastyle.
checkEvaluation(Length(Literal("a花花c")), 4, create_row(string)) checkEvaluation(Length(Literal("a花花c")), 4, create_row(string))
checkEvaluation(OctetLength(Literal("a花花c")), 8, create_row(string))
checkEvaluation(BitLength(Literal("a花花c")), 8 * 8, create_row(string))
// scalastyle:on // scalastyle:on
checkEvaluation(Length(Literal(bytes)), 5, create_row(Array.empty[Byte])) checkEvaluation(Length(Literal(bytes)), 5, create_row(Array.empty[Byte]))
checkEvaluation(OctetLength(Literal(bytes)), 5, create_row(Array.empty[Byte]))
checkEvaluation(BitLength(Literal(bytes)), 5 * 8, create_row(Array.empty[Byte]))
checkEvaluation(Length(a), 5, create_row(string)) checkEvaluation(Length(a), 5, create_row(string))
checkEvaluation(OctetLength(a), 5, create_row(string))
checkEvaluation(BitLength(a), 5 * 8, create_row(string))
checkEvaluation(Length(b), 5, create_row(bytes)) checkEvaluation(Length(b), 5, create_row(bytes))
checkEvaluation(OctetLength(b), 5, create_row(bytes))
checkEvaluation(BitLength(b), 5 * 8, create_row(bytes))
checkEvaluation(Length(a), 0, create_row("")) checkEvaluation(Length(a), 0, create_row(""))
checkEvaluation(OctetLength(a), 0, create_row(""))
checkEvaluation(BitLength(a), 0, create_row(""))
checkEvaluation(Length(b), 0, create_row(Array.empty[Byte])) checkEvaluation(Length(b), 0, create_row(Array.empty[Byte]))
checkEvaluation(OctetLength(b), 0, create_row(Array.empty[Byte]))
checkEvaluation(BitLength(b), 0, create_row(Array.empty[Byte]))
checkEvaluation(Length(a), null, create_row(null)) checkEvaluation(Length(a), null, create_row(null))
checkEvaluation(OctetLength(a), null, create_row(null))
checkEvaluation(BitLength(a), null, create_row(null))
checkEvaluation(Length(b), null, create_row(null)) checkEvaluation(Length(b), null, create_row(null))
checkEvaluation(OctetLength(b), null, create_row(null))
checkEvaluation(BitLength(b), null, create_row(null))
checkEvaluation(Length(Literal.create(null, StringType)), null, create_row(string)) checkEvaluation(Length(Literal.create(null, StringType)), null, create_row(string))
checkEvaluation(OctetLength(Literal.create(null, StringType)), null, create_row(string))
checkEvaluation(BitLength(Literal.create(null, StringType)), null, create_row(string))
checkEvaluation(Length(Literal.create(null, BinaryType)), null, create_row(bytes)) checkEvaluation(Length(Literal.create(null, BinaryType)), null, create_row(bytes))
checkEvaluation(OctetLength(Literal.create(null, BinaryType)), null, create_row(bytes))
checkEvaluation(BitLength(Literal.create(null, BinaryType)), null, create_row(bytes))
} }
test("format_number / FormatNumber") { test("format_number / FormatNumber") {
......
...@@ -80,3 +80,8 @@ select 1 > 0.00001; ...@@ -80,3 +80,8 @@ select 1 > 0.00001;
-- mod -- mod
select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, null); select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, null);
-- length: bit, character and octet length of the same three-character string literal
select BIT_LENGTH('abc');
select CHAR_LENGTH('abc');
select OCTET_LENGTH('abc');
-- Automatically generated by SQLQueryTestSuite -- Automatically generated by SQLQueryTestSuite
-- Number of queries: 51 -- Number of queries: 54
-- !query 0 -- !query 0
...@@ -420,3 +420,27 @@ select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, nu ...@@ -420,3 +420,27 @@ select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, nu
struct<(7 % 2):int,(7 % 0):int,(0 % 2):int,(7 % CAST(NULL AS INT)):int,(CAST(NULL AS INT) % 2):int,(CAST(NULL AS DOUBLE) % CAST(NULL AS DOUBLE)):double> struct<(7 % 2):int,(7 % 0):int,(0 % 2):int,(7 % CAST(NULL AS INT)):int,(CAST(NULL AS INT) % 2):int,(CAST(NULL AS DOUBLE) % CAST(NULL AS DOUBLE)):double>
-- !query 50 output -- !query 50 output
1 NULL 0 NULL NULL NULL 1 NULL 0 NULL NULL NULL
-- !query 51
select BIT_LENGTH('abc')
-- !query 51 schema
struct<bitlength(abc):int>
-- !query 51 output
24
-- !query 52
select CHAR_LENGTH('abc')
-- !query 52 schema
struct<length(abc):int>
-- !query 52 output
3
-- !query 53
select OCTET_LENGTH('abc')
-- !query 53 schema
struct<octetlength(abc):int>
-- !query 53 output
3
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment