From c59abad052b7beec4ef550049413e95578e545be Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun <dongjoon@apache.org>
Date: Tue, 5 Apr 2016 13:31:00 -0700
Subject: [PATCH] [SPARK-14402][SQL] initcap UDF doesn't match Hive/Oracle
 behavior in lowercasing rest of string

## What changes were proposed in this pull request?

Currently, Spark SQL's `initcap` uses the `toTitleCase` function. However, the `UTF8String.toTitleCase` implementation only uppercases the first letter of each word and copies the remaining letters unchanged, e.g. sParK --> SParK. That is correct behavior for `toTitleCase` itself, but it does not match the Hive/Oracle `initcap` behavior, which also lowercases the rest of each word:
```
hive> select initcap('sParK');
Spark
```
```
scala> sql("select initcap('sParK')").head
res0: org.apache.spark.sql.Row = [SParK]
```
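
For reference, the expected Hive/Oracle behavior can be sketched on plain Java `String`s as follows. This is an illustrative helper only (`initcapLikeHive` is not code from this patch, which works on `UTF8String`): lowercase the whole string, then uppercase the first letter of each whitespace-delimited word.
```
// Plain-String sketch of Hive/Oracle-style initcap (illustrative only):
// lowercase everything, then uppercase the first letter of each word.
def initcapLikeHive(s: String): String = {
  val sb = new StringBuilder(s.length)
  var startOfWord = true
  for (c <- s.toLowerCase) {
    sb.append(if (startOfWord && !c.isWhitespace) c.toUpper else c)
    startOfWord = c.isWhitespace
  }
  sb.toString
}

initcapLikeHive("sParK")      // "Spark"
initcapLikeHive("sPark sql")  // "Spark Sql"
```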

This PR updates the implementation of `initcap` to apply `toLowerCase` before `toTitleCase`, so that every letter other than the first letter of each word is lowercased.
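
As a quick sanity check outside of SQL, the same two calls can be chained directly on `UTF8String` (from the `spark-unsafe` module); this is just a sketch of what the patched `nullSafeEval` does, not new API.
```
import org.apache.spark.unsafe.types.UTF8String

// toTitleCase alone would leave "SParK"; lowercasing first gives the Hive/Oracle result.
UTF8String.fromString("sParK").toLowerCase.toTitleCase.toString
// res: String = Spark
```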

## How was this patch tested?

Pass the Jenkins tests (including the new test cases).

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes #12175 from dongjoon-hyun/SPARK-14402.
---
 .../sql/catalyst/expressions/stringExpressions.scala  | 11 ++++++++---
 .../catalyst/expressions/StringExpressionsSuite.scala |  1 +
 .../org/apache/spark/sql/StringFunctionsSuite.scala   |  6 +++---
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 3ee19cc4ad..b6ea03cd5c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -618,19 +618,24 @@ case class FormatString(children: Expression*) extends Expression with ImplicitC
 }
 
 /**
- * Returns string, with the first letter of each word in uppercase.
+ * Returns string, with the first letter of each word in uppercase, all other letters in lowercase.
  * Words are delimited by whitespace.
  */
+@ExpressionDescription(
+  usage = "_FUNC_(str) - " +
+    "Returns str, with the first letter of each word in uppercase, all other letters in " +
+    "lowercase. Words are delimited by white space.",
+  extended = "> SELECT initcap('sPark sql');\n 'Spark Sql'")
 case class InitCap(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def inputTypes: Seq[DataType] = Seq(StringType)
   override def dataType: DataType = StringType
 
   override def nullSafeEval(string: Any): Any = {
-    string.asInstanceOf[UTF8String].toTitleCase
+    string.asInstanceOf[UTF8String].toLowerCase.toTitleCase
   }
   override def genCode(ctx: CodegenContext, ev: ExprCode): String = {
-    defineCodeGen(ctx, ev, str => s"$str.toTitleCase()")
+    defineCodeGen(ctx, ev, str => s"$str.toLowerCase().toTitleCase()")
   }
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 99e3b13ce8..2cf8ca7000 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -382,6 +382,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(InitCap(Literal("a b")), "A B")
     checkEvaluation(InitCap(Literal(" a")), " A")
     checkEvaluation(InitCap(Literal("the test")), "The Test")
+    checkEvaluation(InitCap(Literal("sParK")), "Spark")
     // scalastyle:off
     // non ascii characters are not allowed in the code, so we disable the scalastyle here.
     checkEvaluation(InitCap(Literal("世界")), "世界")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index e2090b0a83..6809f26968 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -272,12 +272,12 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext {
   }
 
   test("initcap function") {
-    val df = Seq(("ab", "a B")).toDF("l", "r")
+    val df = Seq(("ab", "a B", "sParK")).toDF("x", "y", "z")
     checkAnswer(
-      df.select(initcap($"l"), initcap($"r")), Row("Ab", "A B"))
+      df.select(initcap($"x"), initcap($"y"), initcap($"z")), Row("Ab", "A B", "Spark"))
 
     checkAnswer(
-      df.selectExpr("InitCap(l)", "InitCap(r)"), Row("Ab", "A B"))
+      df.selectExpr("InitCap(x)", "InitCap(y)", "InitCap(z)"), Row("Ab", "A B", "Spark"))
   }
 
   test("number format function") {
-- 
GitLab