From 2bc7c96d61a51bd458ba04e9d318640ddada559d Mon Sep 17 00:00:00 2001
From: jerryshao <sshao@hortonworks.com>
Date: Mon, 28 Mar 2016 17:03:21 -0700
Subject: [PATCH] [SPARK-13447][YARN][CORE] Clean the stale states for AM
 failure and restart situation

## What changes were proposed in this pull request?

This is a follow-up fix to #9963. In #9963 the stale-state clean-up was handled only when dynamic allocation is enabled; here we also clean the state in `CoarseGrainedSchedulerBackend` when dynamic allocation is disabled.
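
To show the behavior change in isolation, here is a minimal, self-contained Scala sketch (a toy model, not Spark's actual classes; `ToyBackend`, `removeExecutor`, and `ToyBackendDemo` are hypothetical names) of a `reset()` that clears the pending-executor bookkeeping and drops stale executor entries without the dynamic-allocation guard:

```scala
// Illustrative toy model only; these names are hypothetical and do not exist
// in Spark. It mirrors the shape of the change: the dynamic-allocation guard
// around the clean-up is removed, so stale state is cleared in both modes.
import scala.collection.mutable

class ToyBackend(dynamicAllocationEnabled: Boolean) {
  var numPendingExecutors: Int = 0
  val executorsPendingToRemove = mutable.HashSet[String]()
  val executorDataMap = mutable.HashMap[String, String]()  // executorId -> host

  // Stand-in for driverEndpoint.askWithRetry[Boolean](RemoveExecutor(...)).
  private def removeExecutor(executorId: String, reason: String): Boolean = {
    println(s"Removing $executorId: $reason")
    executorDataMap.remove(executorId).isDefined
  }

  // After this change, reset() no longer checks dynamicAllocationEnabled:
  // the stale bookkeeping is cleared regardless of the allocation mode.
  def reset(): Unit = synchronized {
    numPendingExecutors = 0
    executorsPendingToRemove.clear()
    // Drop every executor still tracked; after an AM failure these entries are
    // stale and will be repopulated once the new AM registers executors again.
    executorDataMap.toMap.foreach { case (eid, _) =>
      removeExecutor(eid, "Stale executor after cluster manager re-registered.")
    }
  }
}

object ToyBackendDemo extends App {
  val backend = new ToyBackend(dynamicAllocationEnabled = false)
  backend.numPendingExecutors = 2
  backend.executorDataMap ++= Seq("1" -> "host-a", "2" -> "host-b")
  backend.reset()
  assert(backend.numPendingExecutors == 0 && backend.executorDataMap.isEmpty)
}
```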

Please review, CC andrewor14 lianhuiwang, thanks a lot.

## How was this patch tested?

Ran the unit tests locally, and manually verified with an integration test.

Author: jerryshao <sshao@hortonworks.com>

Closes #11366 from jerryshao/SPARK-13447.
---
 .../CoarseGrainedSchedulerBackend.scala       | 21 ++++++++-----------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
index b7919efc4b..eb4f5331d6 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
@@ -356,20 +356,17 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
 
   /**
    * Reset the state of CoarseGrainedSchedulerBackend to the initial state. Currently it will only
-   * be called in the yarn-client mode when AM re-registers after a failure, also dynamic
-   * allocation is enabled.
+   * be called in the yarn-client mode when AM re-registers after a failure.
    * */
   protected def reset(): Unit = synchronized {
-    if (Utils.isDynamicAllocationEnabled(conf)) {
-      numPendingExecutors = 0
-      executorsPendingToRemove.clear()
-
-      // Remove all the lingering executors that should be removed but not yet. The reason might be
-      // because (1) disconnected event is not yet received; (2) executors die silently.
-      executorDataMap.toMap.foreach { case (eid, _) =>
-        driverEndpoint.askWithRetry[Boolean](
-          RemoveExecutor(eid, SlaveLost("Stale executor after cluster manager re-registered.")))
-      }
+    numPendingExecutors = 0
+    executorsPendingToRemove.clear()
+
+    // Remove all the lingering executors that should be removed but not yet. The reason might be
+    // because (1) disconnected event is not yet received; (2) executors die silently.
+    executorDataMap.toMap.foreach { case (eid, _) =>
+      driverEndpoint.askWithRetry[Boolean](
+        RemoveExecutor(eid, SlaveLost("Stale executor after cluster manager re-registered.")))
     }
   }
 
-- 
GitLab