From 8c54f1eb71d7338d3bd8224a57a293a2e7875252 Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun <dongjoon@apache.org>
Date: Tue, 15 Aug 2017 23:00:13 -0700
Subject: [PATCH] [SPARK-21422][BUILD] Depend on Apache ORC 1.4.0

## What changes were proposed in this pull request?

Like Parquet, this PR aims to depend on the latest Apache ORC 1.4 for Apache Spark 2.3. There are key benefits for Apache ORC 1.4.

- Stability: Apache ORC 1.4.0 has many fixes and we can depend on ORC community more.
- Maintainability: Reduce the Hive dependency and can remove old legacy code later.

Later, we can get the following two key benefits by adding new ORCFileFormat in SPARK-20728 (#17980), too.
- Usability: User can use ORC data sources without hive module, i.e, -Phive.
- Speed: Use both Spark ColumnarBatch and ORC RowBatch together. This will be faster than the current implementation in Spark.

## How was this patch tested?

Pass the jenkins.

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes #18640 from dongjoon-hyun/SPARK-21422.
---
 assembly/pom.xml               |  6 +++++
 dev/deps/spark-deps-hadoop-2.6 |  3 +++
 dev/deps/spark-deps-hadoop-2.7 |  3 +++
 pom.xml                        | 44 ++++++++++++++++++++++++++++++++++
 sql/core/pom.xml               | 10 ++++++++
 5 files changed, 66 insertions(+)

diff --git a/assembly/pom.xml b/assembly/pom.xml
index 464af16e46..cd8366a175 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -220,6 +220,12 @@
         <hive.deps.scope>provided</hive.deps.scope>
       </properties>
     </profile>
+    <profile>
+      <id>orc-provided</id>
+      <properties>
+        <orc.deps.scope>provided</orc.deps.scope>
+      </properties>
+    </profile>
     <profile>
       <id>parquet-provided</id>
       <properties>
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 83070a906d..01af2c75b0 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -2,6 +2,7 @@ JavaEWAH-0.3.2.jar
 RoaringBitmap-0.5.11.jar
 ST4-4.0.4.jar
 activation-1.1.1.jar
+aircompressor-0.3.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
 antlr4-runtime-4.5.3.jar
@@ -148,6 +149,8 @@ netty-3.9.9.Final.jar
 netty-all-4.0.43.Final.jar
 objenesis-2.1.jar
 opencsv-2.3.jar
+orc-core-1.4.0-nohive.jar
+orc-mapreduce-1.4.0-nohive.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.6.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index 5481e255a5..69f3a4bb60 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -2,6 +2,7 @@ JavaEWAH-0.3.2.jar
 RoaringBitmap-0.5.11.jar
 ST4-4.0.4.jar
 activation-1.1.1.jar
+aircompressor-0.3.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
 antlr4-runtime-4.5.3.jar
@@ -149,6 +150,8 @@ netty-3.9.9.Final.jar
 netty-all-4.0.43.Final.jar
 objenesis-2.1.jar
 opencsv-2.3.jar
+orc-core-1.4.0-nohive.jar
+orc-mapreduce-1.4.0-nohive.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.6.jar
diff --git a/pom.xml b/pom.xml
index d544894b54..c0df3ef0fe 100644
--- a/pom.xml
+++ b/pom.xml
@@ -132,6 +132,8 @@
     <hive.version.short>1.2.1</hive.version.short>
     <derby.version>10.12.1.1</derby.version>
     <parquet.version>1.8.2</parquet.version>
+    <orc.version>1.4.0</orc.version>
+    <orc.classifier>nohive</orc.classifier>
     <hive.parquet.version>1.6.0</hive.parquet.version>
     <jetty.version>9.3.20.v20170531</jetty.version>
     <javaxservlet.version>3.1.0</javaxservlet.version>
@@ -208,6 +210,7 @@
     <flume.deps.scope>compile</flume.deps.scope>
     <hadoop.deps.scope>compile</hadoop.deps.scope>
     <hive.deps.scope>compile</hive.deps.scope>
+    <orc.deps.scope>compile</orc.deps.scope>
     <parquet.deps.scope>compile</parquet.deps.scope>
     <parquet.test.deps.scope>test</parquet.test.deps.scope>
 
@@ -1695,6 +1698,44 @@
           </exclusion>
         </exclusions>
       </dependency>
+      <dependency>
+        <groupId>org.apache.orc</groupId>
+        <artifactId>orc-core</artifactId>
+        <version>${orc.version}</version>
+        <classifier>${orc.classifier}</classifier>
+        <scope>${orc.deps.scope}</scope>
+        <exclusions>
+          <exclusion>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-common</artifactId>
+          </exclusion>
+          <exclusion>
+            <groupId>org.apache.hive</groupId>
+            <artifactId>hive-storage-api</artifactId>
+          </exclusion>
+        </exclusions>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.orc</groupId>
+        <artifactId>orc-mapreduce</artifactId>
+        <version>${orc.version}</version>
+        <classifier>${orc.classifier}</classifier>
+        <scope>${orc.deps.scope}</scope>
+        <exclusions>
+          <exclusion>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-common</artifactId>
+          </exclusion>
+          <exclusion>
+            <groupId>org.apache.orc</groupId>
+            <artifactId>orc-core</artifactId>
+          </exclusion>
+          <exclusion>
+            <groupId>org.apache.hive</groupId>
+            <artifactId>hive-storage-api</artifactId>
+          </exclusion>
+        </exclusions>
+      </dependency>
       <dependency>
         <groupId>org.apache.parquet</groupId>
         <artifactId>parquet-column</artifactId>
@@ -2727,6 +2768,9 @@
     <profile>
       <id>hive-provided</id>
     </profile>
+    <profile>
+      <id>orc-provided</id>
+    </profile>
     <profile>
       <id>parquet-provided</id>
     </profile>
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index a16411ec30..9a3cacbe38 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -86,6 +86,16 @@
       <scope>test</scope>
     </dependency>
 
+    <dependency>
+      <groupId>org.apache.orc</groupId>
+      <artifactId>orc-core</artifactId>
+      <classifier>${orc.classifier}</classifier>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.orc</groupId>
+      <artifactId>orc-mapreduce</artifactId>
+      <classifier>${orc.classifier}</classifier>
+    </dependency>
     <dependency>
       <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-column</artifactId>
-- 
GitLab