Skip to content
Snippets Groups Projects
Commit acd4ac7c authored by Sandy Ryza's avatar Sandy Ryza Committed by Thomas Graves
Browse files

SPARK-3837. Warn when YARN kills containers for exceeding memory limits

I triggered the issue and verified the message gets printed on a pseudo-distributed cluster.

Author: Sandy Ryza <sandy@cloudera.com>

Closes #2744 from sryza/sandy-spark-3837 and squashes the following commits:

858a268 [Sandy Ryza] Review feedback
c937f00 [Sandy Ryza] SPARK-3837. Warn when YARN kills containers for exceeding memory limits
parent 58a6077e
No related branches found
No related tags found
No related merge requests found
......@@ -20,6 +20,7 @@ package org.apache.spark.deploy.yarn
import java.util.{List => JList}
import java.util.concurrent._
import java.util.concurrent.atomic.AtomicInteger
import java.util.regex.Pattern
import scala.collection.JavaConversions._
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
......@@ -375,12 +376,22 @@ private[yarn] abstract class YarnAllocator(
logInfo("Completed container %s (state: %s, exit status: %s)".format(
containerId,
completedContainer.getState,
completedContainer.getExitStatus()))
completedContainer.getExitStatus))
// Hadoop 2.2.X added a ContainerExitStatus we should switch to use
// there are some exit status' we shouldn't necessarily count against us, but for
// now I think its ok as none of the containers are expected to exit
if (completedContainer.getExitStatus() != 0) {
logInfo("Container marked as failed: " + containerId)
if (completedContainer.getExitStatus == -103) { // vmem limit exceeded
logWarning(memLimitExceededLogMessage(
completedContainer.getDiagnostics,
VMEM_EXCEEDED_PATTERN))
} else if (completedContainer.getExitStatus == -104) { // pmem limit exceeded
logWarning(memLimitExceededLogMessage(
completedContainer.getDiagnostics,
PMEM_EXCEEDED_PATTERN))
} else if (completedContainer.getExitStatus != 0) {
logInfo("Container marked as failed: " + containerId +
". Exit status: " + completedContainer.getExitStatus +
". Diagnostics: " + completedContainer.getDiagnostics)
numExecutorsFailed.incrementAndGet()
}
}
......@@ -428,6 +439,19 @@ private[yarn] abstract class YarnAllocator(
}
}
private val MEM_REGEX = "[0-9.]+ [KMG]B"
private val PMEM_EXCEEDED_PATTERN =
Pattern.compile(s"$MEM_REGEX of $MEM_REGEX physical memory used")
private val VMEM_EXCEEDED_PATTERN =
Pattern.compile(s"$MEM_REGEX of $MEM_REGEX virtual memory used")
def memLimitExceededLogMessage(diagnostics: String, pattern: Pattern): String = {
val matcher = pattern.matcher(diagnostics)
val diag = if (matcher.find()) " " + matcher.group() + "." else ""
("Container killed by YARN for exceeding memory limits." + diag
+ " Consider boosting spark.yarn.executor.memoryOverhead.")
}
protected def allocatedContainersOnHost(host: String): Int = {
var retval = 0
allocatedHostToContainersMap.synchronized {
......
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.deploy.yarn
import org.apache.spark.deploy.yarn.MemLimitLogger._
import org.scalatest.FunSuite
class YarnAllocatorSuite extends FunSuite {
test("memory exceeded diagnostic regexes") {
val diagnostics =
"Container [pid=12465,containerID=container_1412887393566_0003_01_000002] is running " +
"beyond physical memory limits. Current usage: 2.1 MB of 2 GB physical memory used; " +
"5.8 GB of 4.2 GB virtual memory used. Killing container."
val vmemMsg = memLimitExceededLogMessage(diagnostics, VMEM_EXCEEDED_PATTERN)
val pmemMsg = memLimitExceededLogMessage(diagnostics, PMEM_EXCEEDED_PATTERN)
assert(vmemMsg.contains("5.8 GB of 4.2 GB virtual memory used."))
assert(pmemMsg.contains("2.1 MB of 2 GB physical memory used."))
}
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment