From 0a38637d05d2338503ecceacfb911a6da6d49538 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 21 Dec 2015 22:15:52 -0800
Subject: [PATCH] [SPARK-11807] Remove support for Hadoop < 2.2 i.e. Hadoop 1
 and Hadoop 2.0

Author: Reynold Xin <rxin@databricks.com>

Closes #10404 from rxin/SPARK-11807.
---
 .../deploy/history/FsHistoryProvider.scala    | 10 +---------
 .../mapreduce/SparkHadoopMapReduceUtil.scala  | 17 ++---------------
 dev/create-release/release-build.sh           |  3 ---
 dev/run-tests-jenkins.py                      |  4 ----
 dev/run-tests.py                              |  2 --
 docs/building-spark.md                        | 18 ++++--------------
 make-distribution.sh                          |  2 +-
 pom.xml                                       | 13 -------------
 sql/README.md                                 |  2 +-
 9 files changed, 9 insertions(+), 62 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
index 718efc4f3b..6e91d73b6e 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
@@ -663,16 +663,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
 
   // For testing.
   private[history] def isFsInSafeMode(dfs: DistributedFileSystem): Boolean = {
-    val hadoop1Class = "org.apache.hadoop.hdfs.protocol.FSConstants$SafeModeAction"
     val hadoop2Class = "org.apache.hadoop.hdfs.protocol.HdfsConstants$SafeModeAction"
-    val actionClass: Class[_] =
-      try {
-        getClass().getClassLoader().loadClass(hadoop2Class)
-      } catch {
-        case _: ClassNotFoundException =>
-          getClass().getClassLoader().loadClass(hadoop1Class)
-      }
-
+    val actionClass: Class[_] = getClass().getClassLoader().loadClass(hadoop2Class)
     val action = actionClass.getField("SAFEMODE_GET").get(null)
     val method = dfs.getClass().getMethod("setSafeMode", action.getClass())
     method.invoke(dfs, action).asInstanceOf[Boolean]
diff --git a/core/src/main/scala/org/apache/spark/mapreduce/SparkHadoopMapReduceUtil.scala b/core/src/main/scala/org/apache/spark/mapreduce/SparkHadoopMapReduceUtil.scala
index 943ebcb7bd..82d807fad8 100644
--- a/core/src/main/scala/org/apache/spark/mapreduce/SparkHadoopMapReduceUtil.scala
+++ b/core/src/main/scala/org/apache/spark/mapreduce/SparkHadoopMapReduceUtil.scala
@@ -26,17 +26,13 @@ import org.apache.spark.util.Utils
 
 private[spark] trait SparkHadoopMapReduceUtil {
   def newJobContext(conf: Configuration, jobId: JobID): JobContext = {
-    val klass = firstAvailableClass(
-        "org.apache.hadoop.mapreduce.task.JobContextImpl",  // hadoop2, hadoop2-yarn
-        "org.apache.hadoop.mapreduce.JobContext")           // hadoop1
+    val klass = Utils.classForName("org.apache.hadoop.mapreduce.task.JobContextImpl")
     val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[JobID])
     ctor.newInstance(conf, jobId).asInstanceOf[JobContext]
   }
 
   def newTaskAttemptContext(conf: Configuration, attemptId: TaskAttemptID): TaskAttemptContext = {
-    val klass = firstAvailableClass(
-        "org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl",  // hadoop2, hadoop2-yarn
-        "org.apache.hadoop.mapreduce.TaskAttemptContext")           // hadoop1
+    val klass = Utils.classForName("org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl")
     val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[TaskAttemptID])
     ctor.newInstance(conf, attemptId).asInstanceOf[TaskAttemptContext]
   }
@@ -69,13 +65,4 @@ trait SparkHadoopMapReduceUtil {
       }
     }
   }
-
-  private def firstAvailableClass(first: String, second: String): Class[_] = {
-    try {
-      Utils.classForName(first)
-    } catch {
-      case e: ClassNotFoundException =>
-        Utils.classForName(second)
-    }
-  }
 }
diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh
index cb79e9eba0..b1895b16b1 100755
--- a/dev/create-release/release-build.sh
+++ b/dev/create-release/release-build.sh
@@ -166,9 +166,6 @@ if [[ "$1" == "package" ]]; then
 
   # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds
   # share the same Zinc server.
-  make_binary_release "hadoop1" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver" "3030" &
-  make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Dscala-2.11" "3031" &
-  make_binary_release "cdh4" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" &
   make_binary_release "hadoop2.3" "-Psparkr -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" &
   make_binary_release "hadoop2.4" "-Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" &
   make_binary_release "hadoop2.6" "-Psparkr -Phadoop-2.6 -Phive -Phive-thriftserver -Pyarn" "3034" &
diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py
index 7aecea25b2..42afca0e52 100755
--- a/dev/run-tests-jenkins.py
+++ b/dev/run-tests-jenkins.py
@@ -163,10 +163,6 @@ def main():
     if "test-maven" in ghprb_pull_title:
         os.environ["AMPLAB_JENKINS_BUILD_TOOL"] = "maven"
     # Switch the Hadoop profile based on the PR title:
-    if "test-hadoop1.0" in ghprb_pull_title:
-        os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop1.0"
-    if "test-hadoop2.0" in ghprb_pull_title:
-        os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.0"
     if "test-hadoop2.2" in ghprb_pull_title:
         os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.2"
     if "test-hadoop2.3" in ghprb_pull_title:
diff --git a/dev/run-tests.py b/dev/run-tests.py
index 2d4e04c468..17ceba052b 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -301,8 +301,6 @@ def get_hadoop_profiles(hadoop_version):
     """
 
     sbt_maven_hadoop_profiles = {
-        "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.2.1"],
-        "hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"],
         "hadoop2.2": ["-Pyarn", "-Phadoop-2.2"],
         "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"],
         "hadoop2.6": ["-Pyarn", "-Phadoop-2.6"],
diff --git a/docs/building-spark.md b/docs/building-spark.md
index 3d38edbdad..785988902d 100644
--- a/docs/building-spark.md
+++ b/docs/building-spark.md
@@ -33,13 +33,13 @@ to the `sharedSettings` val. See also [this PR](https://github.com/apache/spark/
 
 # Building a Runnable Distribution
 
-To create a Spark distribution like those distributed by the 
-[Spark Downloads](http://spark.apache.org/downloads.html) page, and that is laid out so as 
-to be runnable, use `make-distribution.sh` in the project root directory. It can be configured 
+To create a Spark distribution like those distributed by the
+[Spark Downloads](http://spark.apache.org/downloads.html) page, and that is laid out so as
+to be runnable, use `make-distribution.sh` in the project root directory. It can be configured
 with Maven profile settings and so on like the direct Maven build.
 Example:
 
     ./make-distribution.sh --name custom-spark --tgz -Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn
-
+
 For more information on usage, run `./make-distribution.sh --help`
 
@@ -74,7 +74,6 @@ Because HDFS is not protocol-compatible across versions, if you want to read fro
     <tr><th>Hadoop version</th><th>Profile required</th></tr>
   </thead>
   <tbody>
-    <tr><td>1.x to 2.1.x</td><td>hadoop-1</td></tr>
     <tr><td>2.2.x</td><td>hadoop-2.2</td></tr>
     <tr><td>2.3.x</td><td>hadoop-2.3</td></tr>
     <tr><td>2.4.x</td><td>hadoop-2.4</td></tr>
@@ -82,15 +81,6 @@ Because HDFS is not protocol-compatible across versions, if you want to read fro
   </tbody>
 </table>
 
-For Apache Hadoop versions 1.x, Cloudera CDH "mr1" distributions, and other Hadoop versions without YARN, use:
-
-{% highlight bash %}
-# Apache Hadoop 1.2.1
-mvn -Dhadoop.version=1.2.1 -Phadoop-1 -DskipTests clean package
-
-# Cloudera CDH 4.2.0 with MapReduce v1
-mvn -Dhadoop.version=2.0.0-mr1-cdh4.2.0 -Phadoop-1 -DskipTests clean package
-{% endhighlight %}
 
 You can enable the `yarn` profile and optionally set the `yarn.version` property if it is
 different from `hadoop.version`. Spark only supports YARN versions 2.2.0 and later.
diff --git a/make-distribution.sh b/make-distribution.sh
index e64ceb8024..351b9e7d89 100755
--- a/make-distribution.sh
+++ b/make-distribution.sh
@@ -58,7 +58,7 @@ while (( "$#" )); do
     --hadoop)
       echo "Error: '--hadoop' is no longer supported:"
      echo "Error: use Maven profiles and options -Dhadoop.version and -Dyarn.version instead."
-      echo "Error: Related profiles include hadoop-1, hadoop-2.2, hadoop-2.3 and hadoop-2.4."
+      echo "Error: Related profiles include hadoop-2.2, hadoop-2.3 and hadoop-2.4."
       exit_with_usage
       ;;
     --with-yarn)
diff --git a/pom.xml b/pom.xml
index 32918d6a74..284c219519 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2442,19 +2442,6 @@
       http://hadoop.apache.org/docs/ra.b.c/hadoop-project-dist/hadoop-common/dependency-analysis.html
     -->
 
-    <profile>
-      <id>hadoop-1</id>
-      <properties>
-        <hadoop.version>1.2.1</hadoop.version>
-        <protobuf.version>2.4.1</protobuf.version>
-        <hbase.version>0.98.7-hadoop1</hbase.version>
-        <avro.mapred.classifier>hadoop1</avro.mapred.classifier>
-        <codehaus.jackson.version>1.8.8</codehaus.jackson.version>
-        <akka.group>org.spark-project.akka</akka.group>
-        <akka.version>2.3.4-spark</akka.version>
-      </properties>
-    </profile>
-
     <profile>
       <id>hadoop-2.2</id>
       <!-- SPARK-7249: Default hadoop profile. Uses global properties. -->
diff --git a/sql/README.md b/sql/README.md
index 63d4dac982..a13bdab6d4 100644
--- a/sql/README.md
+++ b/sql/README.md
@@ -20,7 +20,7 @@ If you are working with Hive 0.12.0, you will need to set several environmental
 ```
 export HIVE_HOME="<path to>/hive/build/dist"
 export HIVE_DEV_HOME="<path to>/hive/"
-export HADOOP_HOME="<path to>/hadoop-1.0.4"
+export HADOOP_HOME="<path to>/hadoop"
 ```
 
 If you are working with Hive 0.13.1, the following steps are needed:
-- 
GitLab
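
Note (illustrative addendum, not part of the commit): the `SparkHadoopMapReduceUtil` simplification works because every Hadoop 2.x release ships `org.apache.hadoop.mapreduce.task.JobContextImpl` and `TaskAttemptContextImpl`, so the Hadoop 1 fallback class names can simply be dropped. A minimal standalone sketch of the same reflection pattern, with the JDK's `Class.forName` standing in for Spark's internal `Utils.classForName` (an assumption made only to keep the sketch self-contained):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{JobContext, JobID, TaskAttemptContext, TaskAttemptID}

object MapReduceContextSketch {
  // Load the Hadoop 2 implementation class reflectively and call its
  // (Configuration, JobID) constructor, mirroring the patched newJobContext.
  def newJobContext(conf: Configuration, jobId: JobID): JobContext = {
    val klass = Class.forName("org.apache.hadoop.mapreduce.task.JobContextImpl")
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[JobID])
    ctor.newInstance(conf, jobId).asInstanceOf[JobContext]
  }

  // Same pattern for task attempt contexts, mirroring the patched newTaskAttemptContext.
  def newTaskAttemptContext(conf: Configuration, attemptId: TaskAttemptID): TaskAttemptContext = {
    val klass = Class.forName("org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl")
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[TaskAttemptID])
    ctor.newInstance(conf, attemptId).asInstanceOf[TaskAttemptContext]
  }
}
```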