From e75e340a406b765608258b49f7e2f1107d4605fb Mon Sep 17 00:00:00 2001
From: Rajesh Balamohan <rbalamohan@apache.org>
Date: Wed, 20 Jan 2016 11:20:26 -0800
Subject: [PATCH] =?UTF-8?q?[SPARK-12925][SQL]=20Improve=20HiveInspectors.u?=
 =?UTF-8?q?nwrap=20for=20StringObjectIns=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Text is in UTF-8 and converting it via "UTF8String.fromString" incurs decoding and encoding, which turns out to be expensive and redundant.  Profiler snapshot details is attached in the JIRA (ref:https://issues.apache.org/jira/secure/attachment/12783331/SPARK-12925_profiler_cpu_samples.png)

Author: Rajesh Balamohan <rbalamohan@apache.org>

Closes #10848 from rajeshbalamohan/SPARK-12925.
---
 .../main/scala/org/apache/spark/sql/hive/HiveInspectors.scala | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 7a260e72eb..5d84feb483 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -320,7 +320,9 @@ private[hive] trait HiveInspectors {
       case hvoi: HiveCharObjectInspector =>
         UTF8String.fromString(hvoi.getPrimitiveJavaObject(data).getValue)
       case x: StringObjectInspector if x.preferWritable() =>
-        UTF8String.fromString(x.getPrimitiveWritableObject(data).toString)
+        // Text is in UTF-8 already. No need to convert again via fromString
+        val wObj = x.getPrimitiveWritableObject(data)
+        UTF8String.fromBytes(wObj.getBytes, 0, wObj.getLength)
       case x: StringObjectInspector =>
         UTF8String.fromString(x.getPrimitiveJavaObject(data))
       case x: IntObjectInspector if x.preferWritable() => x.get(data)
-- 
GitLab