From 8c54f1eb71d7338d3bd8224a57a293a2e7875252 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun <dongjoon@apache.org> Date: Tue, 15 Aug 2017 23:00:13 -0700 Subject: [PATCH] [SPARK-21422][BUILD] Depend on Apache ORC 1.4.0 ## What changes were proposed in this pull request? Like Parquet, this PR aims to depend on the latest Apache ORC 1.4 for Apache Spark 2.3. Apache ORC 1.4 brings the following key benefits. - Stability: Apache ORC 1.4.0 has many fixes, and we can rely more on the ORC community. - Maintainability: It reduces the Hive dependency, so old legacy code can be removed later. Later, we can get the following two key benefits by adding a new ORCFileFormat in SPARK-20728 (#17980), too. - Usability: Users can use ORC data sources without the hive module, i.e., -Phive. - Speed: Use both Spark ColumnarBatch and ORC RowBatch together. This will be faster than the current implementation in Spark. ## How was this patch tested? Pass the Jenkins tests. Author: Dongjoon Hyun <dongjoon@apache.org> Closes #18640 from dongjoon-hyun/SPARK-21422. 
--- assembly/pom.xml | 6 +++++ dev/deps/spark-deps-hadoop-2.6 | 3 +++ dev/deps/spark-deps-hadoop-2.7 | 3 +++ pom.xml | 44 ++++++++++++++++++++++++++++++++++ sql/core/pom.xml | 10 ++++++++ 5 files changed, 66 insertions(+) diff --git a/assembly/pom.xml b/assembly/pom.xml index 464af16e46..cd8366a175 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -220,6 +220,12 @@ <hive.deps.scope>provided</hive.deps.scope> </properties> </profile> + <profile> + <id>orc-provided</id> + <properties> + <orc.deps.scope>provided</orc.deps.scope> + </properties> + </profile> <profile> <id>parquet-provided</id> <properties> diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 83070a906d..01af2c75b0 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -2,6 +2,7 @@ JavaEWAH-0.3.2.jar RoaringBitmap-0.5.11.jar ST4-4.0.4.jar activation-1.1.1.jar +aircompressor-0.3.jar antlr-2.7.7.jar antlr-runtime-3.4.jar antlr4-runtime-4.5.3.jar @@ -148,6 +149,8 @@ netty-3.9.9.Final.jar netty-all-4.0.43.Final.jar objenesis-2.1.jar opencsv-2.3.jar +orc-core-1.4.0-nohive.jar +orc-mapreduce-1.4.0-nohive.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.6.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 5481e255a5..69f3a4bb60 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -2,6 +2,7 @@ JavaEWAH-0.3.2.jar RoaringBitmap-0.5.11.jar ST4-4.0.4.jar activation-1.1.1.jar +aircompressor-0.3.jar antlr-2.7.7.jar antlr-runtime-3.4.jar antlr4-runtime-4.5.3.jar @@ -149,6 +150,8 @@ netty-3.9.9.Final.jar netty-all-4.0.43.Final.jar objenesis-2.1.jar opencsv-2.3.jar +orc-core-1.4.0-nohive.jar +orc-mapreduce-1.4.0-nohive.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.6.jar diff --git a/pom.xml b/pom.xml index d544894b54..c0df3ef0fe 100644 --- a/pom.xml +++ b/pom.xml @@ -132,6 +132,8 @@ <hive.version.short>1.2.1</hive.version.short> 
<derby.version>10.12.1.1</derby.version> <parquet.version>1.8.2</parquet.version> + <orc.version>1.4.0</orc.version> + <orc.classifier>nohive</orc.classifier> <hive.parquet.version>1.6.0</hive.parquet.version> <jetty.version>9.3.20.v20170531</jetty.version> <javaxservlet.version>3.1.0</javaxservlet.version> @@ -208,6 +210,7 @@ <flume.deps.scope>compile</flume.deps.scope> <hadoop.deps.scope>compile</hadoop.deps.scope> <hive.deps.scope>compile</hive.deps.scope> + <orc.deps.scope>compile</orc.deps.scope> <parquet.deps.scope>compile</parquet.deps.scope> <parquet.test.deps.scope>test</parquet.test.deps.scope> @@ -1695,6 +1698,44 @@ </exclusion> </exclusions> </dependency> + <dependency> + <groupId>org.apache.orc</groupId> + <artifactId>orc-core</artifactId> + <version>${orc.version}</version> + <classifier>${orc.classifier}</classifier> + <scope>${orc.deps.scope}</scope> + <exclusions> + <exclusion> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.hive</groupId> + <artifactId>hive-storage-api</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.orc</groupId> + <artifactId>orc-mapreduce</artifactId> + <version>${orc.version}</version> + <classifier>${orc.classifier}</classifier> + <scope>${orc.deps.scope}</scope> + <exclusions> + <exclusion> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.orc</groupId> + <artifactId>orc-core</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.hive</groupId> + <artifactId>hive-storage-api</artifactId> + </exclusion> + </exclusions> + </dependency> <dependency> <groupId>org.apache.parquet</groupId> <artifactId>parquet-column</artifactId> @@ -2727,6 +2768,9 @@ <profile> <id>hive-provided</id> </profile> + <profile> + <id>orc-provided</id> + </profile> <profile> <id>parquet-provided</id> </profile> diff --git 
a/sql/core/pom.xml b/sql/core/pom.xml index a16411ec30..9a3cacbe38 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -86,6 +86,16 @@ <scope>test</scope> </dependency> + <dependency> + <groupId>org.apache.orc</groupId> + <artifactId>orc-core</artifactId> + <classifier>${orc.classifier}</classifier> + </dependency> + <dependency> + <groupId>org.apache.orc</groupId> + <artifactId>orc-mapreduce</artifactId> + <classifier>${orc.classifier}</classifier> + </dependency> <dependency> <groupId>org.apache.parquet</groupId> <artifactId>parquet-column</artifactId> -- GitLab