From bb2b9d0a428b86bf366ee9916e26402f8c00912f Mon Sep 17 00:00:00 2001
From: hyukjinkwon <gurwls223@gmail.com>
Date: Tue, 9 Aug 2016 10:23:54 +0800
Subject: [PATCH] [SPARK-16610][SQL] Add `orc.compress` as an alias for
 `compression` option.

## What changes were proposed in this pull request?

For ORC source, Spark SQL has a writer option `compression`, which is used to set the codec and its value will be also set to `orc.compress` (the orc conf used for codec). However, if a user only set `orc.compress` in the writer option, we should not use the default value of `compression` (snappy) as the codec. Instead, we should respect the value of `orc.compress`.

This PR makes ORC data source not ignoring `orc.compress` when `comperssion` is unset.

So, here is the behaviour,

 1. Check `compression` and use this if it is set.
 2. If `compression` is not set, check `orc.compress` and use it.
 3. If `compression` and `orc.compress` are not set, then use the default snappy.

## How was this patch tested?

Unit test in `OrcQuerySuite`.

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #14518 from HyukjinKwon/SPARK-16610.
---
 .../spark/sql/hive/orc/OrcOptions.scala       | 12 +++++++---
 .../spark/sql/hive/orc/OrcQuerySuite.scala    | 23 +++++++++++++++++++
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala
index 91cf0dc960..c2a126d3bf 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala
@@ -20,8 +20,7 @@ package org.apache.spark.sql.hive.orc
 /**
  * Options for the ORC data source.
  */
-private[orc] class OrcOptions(
-    @transient private val parameters: Map[String, String])
+private[orc] class OrcOptions(@transient private val parameters: Map[String, String])
   extends Serializable {
 
   import OrcOptions._
@@ -31,7 +30,14 @@ private[orc] class OrcOptions(
    * Acceptable values are defined in [[shortOrcCompressionCodecNames]].
    */
   val compressionCodec: String = {
-    val codecName = parameters.getOrElse("compression", "snappy").toLowerCase
+    // `orc.compress` is a ORC configuration. So, here we respect this as an option but
+    // `compression` has higher precedence than `orc.compress`. It means if both are set,
+    // we will use `compression`.
+    val orcCompressionConf = parameters.get(OrcRelation.ORC_COMPRESSION)
+    val codecName = parameters
+      .get("compression")
+      .orElse(orcCompressionConf)
+      .getOrElse("snappy").toLowerCase
     if (!shortOrcCompressionCodecNames.contains(codecName)) {
       val availableCodecs = shortOrcCompressionCodecNames.keys.map(_.toLowerCase)
       throw new IllegalArgumentException(s"Codec [$codecName] " +
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
index 49e963ee12..b13878d578 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -161,6 +161,29 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
     }
   }
 
+  test("SPARK-16610: Respect orc.compress option when compression is unset") {
+    // Respect `orc.compress`.
+    withTempPath { file =>
+      spark.range(0, 10).write
+        .option("orc.compress", "ZLIB")
+        .orc(file.getCanonicalPath)
+      val expectedCompressionKind =
+        OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression
+      assert("ZLIB" === expectedCompressionKind.name())
+    }
+
+    // `compression` overrides `orc.compress`.
+    withTempPath { file =>
+      spark.range(0, 10).write
+        .option("compression", "ZLIB")
+        .option("orc.compress", "SNAPPY")
+        .orc(file.getCanonicalPath)
+      val expectedCompressionKind =
+        OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression
+      assert("ZLIB" === expectedCompressionKind.name())
+    }
+  }
+
   // Hive supports zlib, snappy and none for Hive 1.2.1.
   test("Compression options for writing to an ORC file (SNAPPY, ZLIB and NONE)") {
     withTempPath { file =>
-- 
GitLab