Skip to content
Snippets Groups Projects
Commit 3fb302c0 authored by Matei Zaharia's avatar Matei Zaharia
Browse files

Merge pull request #205 from kayousterhout/logging

Added logging of scheduler delays to UI

This commit adds two metrics to the UI:

1) The time to get task results, if they're fetched remotely

2) The scheduler delay.  When the scheduler starts getting overwhelmed (because it can't keep up with the rate at which tasks are being submitted), the result is that tasks get delayed on the tail-end: the message from the worker saying that the task has completed ends up in a long queue and takes a while to be processed by the scheduler.  This commit records that delay in the UI so that users can tell when the scheduler is becoming the bottleneck.
parents 87676a6a 58b3aff9
No related branches found
No related tags found
No related merge requests found
......@@ -60,11 +60,13 @@ private[spark] class StagePage(parent: JobProgressUI) {
var activeTime = 0L
listener.stageIdToTasksActive(stageId).foreach(activeTime += _.timeRunning(now))
val finishedTasks = listener.stageIdToTaskInfos(stageId).filter(_._1.finished)
val summary =
<div>
<ul class="unstyled">
<li>
<strong>CPU time: </strong>
<strong>Total duration across all tasks: </strong>
{parent.formatDuration(listener.stageIdToTime.getOrElse(stageId, 0L) + activeTime)}
</li>
{if (hasShuffleRead)
......@@ -104,6 +106,33 @@ private[spark] class StagePage(parent: JobProgressUI) {
val serviceQuantiles = "Duration" +: Distribution(serviceTimes).get.getQuantiles().map(
ms => parent.formatDuration(ms.toLong))
val gettingResultTimes = validTasks.map{case (info, metrics, exception) =>
if (info.gettingResultTime > 0) {
(info.finishTime - info.gettingResultTime).toDouble
} else {
0.0
}
}
val gettingResultQuantiles = ("Time spent fetching task results" +:
Distribution(gettingResultTimes).get.getQuantiles().map(
millis => parent.formatDuration(millis.toLong)))
// The scheduler delay includes the network delay to send the task to the worker
// machine and to send back the result (but not the time to fetch the task result,
// if it needed to be fetched from the block manager on the worker).
val schedulerDelays = validTasks.map{case (info, metrics, exception) =>
val totalExecutionTime = {
if (info.gettingResultTime > 0) {
(info.gettingResultTime - info.launchTime).toDouble
} else {
(info.finishTime - info.launchTime).toDouble
}
}
totalExecutionTime - metrics.get.executorRunTime
}
val schedulerDelayQuantiles = ("Scheduler delay" +:
Distribution(schedulerDelays).get.getQuantiles().map(
millis => parent.formatDuration(millis.toLong)))
def getQuantileCols(data: Seq[Double]) =
Distribution(data).get.getQuantiles().map(d => Utils.bytesToString(d.toLong))
......@@ -119,7 +148,10 @@ private[spark] class StagePage(parent: JobProgressUI) {
}
val shuffleWriteQuantiles = "Shuffle Write" +: getQuantileCols(shuffleWriteSizes)
val listings: Seq[Seq[String]] = Seq(serviceQuantiles,
val listings: Seq[Seq[String]] = Seq(
serviceQuantiles,
gettingResultQuantiles,
schedulerDelayQuantiles,
if (hasShuffleRead) shuffleReadQuantiles else Nil,
if (hasShuffleWrite) shuffleWriteQuantiles else Nil)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment