Skip to content
Snippets Groups Projects
Commit 6b5980da authored by Reynold Xin
Browse files

Set a limit on the number of retries in standalone deploy mode.

parent 9a449e00
No related branches found
No related tags found
No related merge requests found
......@@ -31,4 +31,13 @@ class JobInfo(val id: String, val desc: JobDescription, val submitDate: Date, va
}
def coresLeft: Int = desc.cores - coresGranted
// Backing counter for retryCount; starts at 0 and is only changed by
// incrementRetryCount.
private var _retryCount = 0

/** Current value of the retry counter. */
def retryCount: Int = _retryCount

/**
 * Adds one to the retry counter.
 *
 * Kept parameterless so existing call sites (`jobInfo.incrementRetryCount`)
 * continue to compile unchanged.
 *
 * @return the counter value after the increment
 */
def incrementRetryCount: Int = {
  _retryCount += 1
  _retryCount
}
}
......@@ -4,4 +4,6 @@ object JobState extends Enumeration("WAITING", "RUNNING", "FINISHED", "FAILED")
// Type alias for this enumeration's Value type.
type JobState = Value
// One Value per job state, declared in the same order as the names passed to
// the Enumeration constructor; keep the two lists in sync.
val WAITING, RUNNING, FINISHED, FAILED = Value
// Retry limit: the master compares a job's retry count against this before
// calling schedule() again after an executor finishes.
val MAX_NUM_RETRY = 10
}
package spark.deploy.master
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
import akka.actor._
import spark.{Logging, Utils}
import spark.util.AkkaUtils
import akka.actor.Terminated
import akka.remote.{RemoteClientLifeCycleEvent, RemoteClientDisconnected, RemoteClientShutdown}
import java.text.SimpleDateFormat
import java.util.Date
import akka.remote.RemoteClientLifeCycleEvent
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
import spark.deploy._
import akka.remote.RemoteClientShutdown
import akka.remote.RemoteClientDisconnected
import spark.deploy.RegisterWorker
import spark.deploy.RegisterWorkerFailed
import akka.actor.Terminated
import spark.{Logging, SparkException, Utils}
import spark.util.AkkaUtils
class Master(ip: String, port: Int, webUiPort: Int) extends Actor with Logging {
val DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss") // For job IDs
......@@ -81,12 +80,22 @@ class Master(ip: String, port: Int, webUiPort: Int) extends Actor with Logging {
exec.state = state
exec.job.actor ! ExecutorUpdated(execId, state, message)
if (ExecutorState.isFinished(state)) {
val jobInfo = idToJob(jobId)
// Remove this executor from the worker and job
logInfo("Removing executor " + exec.fullId + " because it is " + state)
idToJob(jobId).removeExecutor(exec)
jobInfo.removeExecutor(exec)
exec.worker.removeExecutor(exec)
// TODO: the worker would probably want to restart the executor a few times
schedule()
// Only retry certain number of times so we don't go into an infinite loop.
if (jobInfo.incrementRetryCount <= JobState.MAX_NUM_RETRY) {
schedule()
} else {
val e = new SparkException("Job %s wth ID %s failed %d times.".format(
jobInfo.desc.name, jobInfo.id, jobInfo.retryCount))
logError(e.getMessage, e)
throw e
//System.exit(1)
}
}
}
case None =>
......@@ -112,7 +121,7 @@ class Master(ip: String, port: Int, webUiPort: Int) extends Actor with Logging {
addressToWorker.get(address).foreach(removeWorker)
addressToJob.get(address).foreach(removeJob)
}
case RequestMasterState => {
sender ! MasterState(ip + ":" + port, workers.toList, jobs.toList, completedJobs.toList)
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment