Skip to content
Snippets Groups Projects
Commit 27b3f3f0 authored by Stephen Haberman
Browse files

Handle slaveLost before slaveIdToHost knows about it.

parent d1de9d7d
No related branches found
No related tags found
No related merge requests found
...@@ -252,19 +252,24 @@ private[spark] class ClusterScheduler(val sc: SparkContext) ...@@ -252,19 +252,24 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
def slaveLost(slaveId: String, reason: ExecutorLossReason) { def slaveLost(slaveId: String, reason: ExecutorLossReason) {
var failedHost: Option[String] = None var failedHost: Option[String] = None
synchronized { synchronized {
val host = slaveIdToHost(slaveId) slaveIdToHost.get(slaveId) match {
if (hostsAlive.contains(host)) { case Some(host) =>
logError("Lost an executor on " + host + ": " + reason) if (hostsAlive.contains(host)) {
slaveIdsWithExecutors -= slaveId logError("Lost an executor on " + host + ": " + reason)
hostsAlive -= host slaveIdsWithExecutors -= slaveId
activeTaskSetsQueue.foreach(_.hostLost(host)) hostsAlive -= host
failedHost = Some(host) activeTaskSetsQueue.foreach(_.hostLost(host))
} else { failedHost = Some(host)
// We may get multiple slaveLost() calls with different loss reasons. For example, one } else {
// may be triggered by a dropped connection from the slave while another may be a report // We may get multiple slaveLost() calls with different loss reasons. For example, one
// of executor termination from Mesos. We produce log messages for both so we eventually // may be triggered by a dropped connection from the slave while another may be a report
// report the termination reason. // of executor termination from Mesos. We produce log messages for both so we eventually
logError("Lost an executor on " + host + " (already removed): " + reason) // report the termination reason.
logError("Lost an executor on " + host + " (already removed): " + reason)
}
case None =>
// We were told about a slave being lost before we could even allocate work to it
logError("Lost slave " + slaveId + " (no work assigned yet)")
} }
} }
if (failedHost != None) { if (failedHost != None) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment