Commit 092e2f15 authored by Mark Hamstra, committed by Andrew Or

SPARK-2425 Don't kill a still-running Application because of some misbehaving Executors

Introduces a LOADING -> RUNNING ExecutorState transition and prevents the Master from removing an Application that still has RUNNING Executors.

Two basic changes: 1) instead of allowing MAX_NUM_RETRY abnormal Executor exits over the entire lifetime of the Application, allow that many failures since the last time any Executor successfully began running the Application; 2) don't remove the Application while the Master still thinks there are RUNNING Executors.
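To make the interplay of those two changes concrete, here is a minimal, self-contained sketch of the retry-gating logic in plain Scala. ExecState, AppInfo, maxRetries, and the println placeholders are illustrative stubs for this example, not Spark classes; the actual logic lives in the Master hunk further down.

// Sketch of the retry-gating behavior described above (stubs, not Spark APIs).
object RetryGatingSketch {

  object ExecState extends Enumeration {
    val LOADING, RUNNING, FAILED, EXITED = Value
    def isFinished(s: Value): Boolean = s == FAILED || s == EXITED
  }

  class AppInfo(maxRetries: Int) {
    private var retryCount = 0
    private var executorStates = Map.empty[Int, ExecState.Value]

    def executorStateChanged(execId: Int, state: ExecState.Value, exitStatus: Option[Int]): Unit = {
      executorStates += execId -> state
      // Change 1: a successful start resets the failure budget, so the limit applies
      // to failures since the last time an Executor began RUNNING, not to the
      // Application's whole lifetime.
      if (state == ExecState.RUNNING) retryCount = 0

      if (ExecState.isFinished(state)) {
        executorStates -= execId
        val normalExit = exitStatus == Some(0)
        if (!normalExit) {
          retryCount += 1
          if (retryCount < maxRetries) {
            println("abnormal exit; rescheduling the Executor")  // schedule()
          } else if (!executorStates.values.exists(_ == ExecState.RUNNING)) {
            // Change 2: give up only when the budget is exhausted AND no Executor
            // for this Application is still RUNNING.
            println(s"Application failed $retryCount times; removing it")
          }
        }
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val app = new AppInfo(maxRetries = 2)
    app.executorStateChanged(0, ExecState.RUNNING, None)
    app.executorStateChanged(0, ExecState.FAILED, Some(1))  // rescheduled
    app.executorStateChanged(1, ExecState.FAILED, Some(1))  // budget exhausted, none RUNNING -> removed
  }
}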

This should be fine as long as the ApplicationInfo doesn't believe any Executors are forever RUNNING when they are not. I think that any Executor that is no longer actually running will eventually stop being counted as RUNNING in the Master's accounting, but another set of eyes should confirm that. This PR also doesn't try to detect which nodes have gone rogue or to kill off bad Workers, so repeatedly failing Executors will continue to fail and fill up log files with failure reports as long as the Application keeps running.

Author: Mark Hamstra <markhamstra@gmail.com>

Closes #1360 from markhamstra/SPARK-2425 and squashes the following commits:

f099c0b [Mark Hamstra] Reuse appInfo
b2b7b25 [Mark Hamstra] Moved 'Application failed' logging
bdd0928 [Mark Hamstra] switched to string interpolation
1dd591b [Mark Hamstra] SPARK-2425 introduce LOADING -> RUNNING ApplicationState transition and prevent Master from removing Application with RUNNING Executors
parent 2b7ab814
@@ -96,11 +96,13 @@ private[spark] class ApplicationInfo(
   def retryCount = _retryCount
 
-  def incrementRetryCount = {
+  def incrementRetryCount() = {
     _retryCount += 1
     _retryCount
   }
 
+  def resetRetryCount() = _retryCount = 0
+
   def markFinished(endState: ApplicationState.Value) {
     state = endState
     endTime = System.currentTimeMillis()
@@ -296,28 +296,34 @@ private[spark] class Master(
       val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId))
       execOption match {
         case Some(exec) => {
+          val appInfo = idToApp(appId)
           exec.state = state
+          if (state == ExecutorState.RUNNING) { appInfo.resetRetryCount() }
           exec.application.driver ! ExecutorUpdated(execId, state, message, exitStatus)
           if (ExecutorState.isFinished(state)) {
-            val appInfo = idToApp(appId)
             // Remove this executor from the worker and app
-            logInfo("Removing executor " + exec.fullId + " because it is " + state)
+            logInfo(s"Removing executor ${exec.fullId} because it is $state")
             appInfo.removeExecutor(exec)
             exec.worker.removeExecutor(exec)
 
-            val normalExit = exitStatus.exists(_ == 0)
+            val normalExit = exitStatus == Some(0)
             // Only retry certain number of times so we don't go into an infinite loop.
-            if (!normalExit && appInfo.incrementRetryCount < ApplicationState.MAX_NUM_RETRY) {
-              schedule()
-            } else if (!normalExit) {
-              logError("Application %s with ID %s failed %d times, removing it".format(
-                appInfo.desc.name, appInfo.id, appInfo.retryCount))
-              removeApplication(appInfo, ApplicationState.FAILED)
+            if (!normalExit) {
+              if (appInfo.incrementRetryCount() < ApplicationState.MAX_NUM_RETRY) {
+                schedule()
+              } else {
+                val execs = appInfo.executors.values
+                if (!execs.exists(_.state == ExecutorState.RUNNING)) {
+                  logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " +
+                    s"${appInfo.retryCount} times; removing it")
+                  removeApplication(appInfo, ApplicationState.FAILED)
+                }
+              }
             }
           }
         }
         case None =>
-          logWarning("Got status update for unknown executor " + appId + "/" + execId)
+          logWarning(s"Got status update for unknown executor $appId/$execId")
       }
     }
@@ -159,6 +159,8 @@ private[spark] class ExecutorRunner(
       Files.write(header, stderr, Charsets.UTF_8)
       stderrAppender = FileAppender(process.getErrorStream, stderr, conf)
 
+      state = ExecutorState.RUNNING
+      worker ! ExecutorStateChanged(appId, execId, state, None, None)
       // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
       // or with nonzero exit code
       val exitCode = process.waitFor()
@@ -234,7 +234,7 @@ private[spark] class Worker(
         try {
           logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))
           val manager = new ExecutorRunner(appId, execId, appDesc, cores_, memory_,
-            self, workerId, host, sparkHome, workDir, akkaUrl, conf, ExecutorState.RUNNING)
+            self, workerId, host, sparkHome, workDir, akkaUrl, conf, ExecutorState.LOADING)
           executors(appId + "/" + execId) = manager
           manager.start()
           coresUsed += cores_
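For the Worker/ExecutorRunner side shown in the two hunks above, the intended lifecycle is: the Worker now registers a freshly launched executor as LOADING, and the runner reports RUNNING only once the executor process has actually been started, which is what lets the Master reset the Application's retry count. A rough sketch of that handshake follows, with the Akka message replaced by a plain callback so the example is self-contained; all names here are illustrative stubs, not Spark APIs.

// Sketch of the LOADING -> RUNNING handshake introduced by the patch.
// In Spark the runner sends ExecutorStateChanged to the Worker actor, which
// forwards it to the Master; a plain callback stands in for that here.
object ExecutorLifecycleSketch {

  object ExecState extends Enumeration { val LOADING, RUNNING, EXITED = Value }

  class Runner(execId: Int, report: (Int, ExecState.Value) => Unit) {
    // Before the patch the runner was created already marked RUNNING;
    // now it starts out as LOADING.
    var state: ExecState.Value = ExecState.LOADING

    def launch(): Unit = {
      // ... fork the executor process here ...
      // Only after the process exists do we claim RUNNING and notify the
      // Worker/Master, which resets the Application's retry count.
      state = ExecState.RUNNING
      report(execId, state)
    }
  }

  def main(args: Array[String]): Unit = {
    val runner = new Runner(0, (id, s) => println(s"executor $id is now $s"))
    println(s"executor 0 starts as ${runner.state}")  // LOADING
    runner.launch()                                   // prints: executor 0 is now RUNNING
  }
}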