#!/usr/bin/env python2
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import print_function
import itertools
from optparse import OptionParser
import os
import re
import sys
import subprocess
from collections import namedtuple
from sparktestsupport import SPARK_HOME, USER_HOME
from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which
import sparktestsupport.modules as modules
# -------------------------------------------------------------------------------------------------
# Functions for traversing module dependency graph
# -------------------------------------------------------------------------------------------------
def determine_modules_for_files(filenames):
"""
Given a list of filenames, return the set of modules that contain those files.
If a file is not associated with a more specific submodule, then this method will consider that
file to belong to the 'root' module.
>>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"]))
['pyspark-core', 'sql']
>>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])]
['root']
"""
changed_modules = set()
for filename in filenames:
matched_at_least_one_module = False
for module in modules.all_modules:
if module.contains_file(filename):
changed_modules.add(module)
matched_at_least_one_module = True
if not matched_at_least_one_module:
changed_modules.add(modules.root)
return changed_modules
def identify_changed_files_from_git_commits(patch_sha, target_branch=None, target_ref=None):
"""
Given a git commit and target ref, use the set of files changed in the diff in order to
determine which modules' tests should be run.
>>> [x.name for x in determine_modules_for_files( \
identify_changed_files_from_git_commits("fc0a1475ef", target_ref="5da21f07"))]
['graphx']
>>> 'root' in [x.name for x in determine_modules_for_files( \
identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))]
True
"""
if target_branch is None and target_ref is None:
raise AttributeError("must specify either target_branch or target_ref")
elif target_branch is not None and target_ref is not None:
raise AttributeError("must specify either target_branch or target_ref, not both")
if target_branch is not None:
diff_target = target_branch
run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
else:
diff_target = target_ref
raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target],
universal_newlines=True)
# Remove any empty strings
return [f for f in raw_output.split('\n') if f]
def setup_test_environ(environ):
print("[info] Setup the following environment variables for tests: ")
for (k, v) in environ.items():
print("%s=%s" % (k, v))
os.environ[k] = v
def determine_modules_to_test(changed_modules):
"""
Given a set of modules that have changed, compute the transitive closure of those modules'
dependent modules in order to determine the set of modules that should be tested.
>>> sorted(x.name for x in determine_modules_to_test([modules.root]))
['root']
>>> sorted(x.name for x in determine_modules_to_test([modules.graphx]))
['examples', 'graphx']
>>> x = sorted(x.name for x in determine_modules_to_test([modules.sql]))
>>> x # doctest: +NORMALIZE_WHITESPACE
['examples', 'hive-thriftserver', 'mllib', 'pyspark-ml', \
'pyspark-mllib', 'pyspark-sql', 'sparkr', 'sql']
"""
# If we're going to have to run all of the tests, then we can just short-circuit
# and return 'root'. No module depends on root, so if it appears then it will be
# in changed_modules.
if modules.root in changed_modules:
return [modules.root]
modules_to_test = set()
for module in changed_modules:
modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
return modules_to_test.union(set(changed_modules))
# -------------------------------------------------------------------------------------------------
# Functions for working with subprocesses and shell tools
# -------------------------------------------------------------------------------------------------
def get_error_codes(err_code_file):
"""Function to retrieve all block numbers from the `run-tests-codes.sh`
file to maintain backwards compatibility with the `run-tests-jenkins`
script"""
with open(err_code_file, 'r') as f:
err_codes = [e.split()[1].strip().split('=')
for e in f if e.startswith("readonly")]
return dict(err_codes)
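# Mapping from pipeline block names (e.g. "BLOCK_RAT") to the exit codes declared in
# dev/run-tests-codes.sh, so that failures are reported consistently to run-tests-jenkins.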
ERROR_CODES = get_error_codes(os.path.join(SPARK_HOME, "dev/run-tests-codes.sh"))
def determine_java_executable():
"""Will return the path of the java executable that will be used by Spark's
tests or `None`"""
# Any changes in the way that Spark's build detects java must be reflected
# here. Currently the build looks for $JAVA_HOME/bin/java then falls back to
# the `java` executable on the path
java_home = os.environ.get("JAVA_HOME")
# check if there is an executable at $JAVA_HOME/bin/java
java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None
# if the java_exe wasn't set, check for a `java` version on the $PATH
return java_exe if java_exe else which("java")
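# Parsed representation of the `java -version` output, e.g. "1.8.0_25" becomes
# JavaVersion(major=1, minor=8, patch=0, update=25).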
JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update'])
def determine_java_version(java_exe):
"""Given a valid java executable will return its version in named tuple format
with accessors '.major', '.minor', '.patch', '.update'"""
raw_output = subprocess.check_output([java_exe, "-version"],
stderr=subprocess.STDOUT,
universal_newlines=True)
raw_output_lines = raw_output.split('\n')
# find raw version string, eg 'java version "1.8.0_25"'
raw_version_str = next(x for x in raw_output_lines if " version " in x)
version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25'
version, update = version_str.split('_') # eg ['1.8.0', '25']
# map over the values and convert them to integers
version_info = [int(x) for x in version.split('.') + [update]]
return JavaVersion(major=version_info[0],
minor=version_info[1],
patch=version_info[2],
update=version_info[3])
# -------------------------------------------------------------------------------------------------
# Functions for running the other build and test scripts
# -------------------------------------------------------------------------------------------------
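# set_title_and_block() prints a banner for each test phase and exports CURRENT_BLOCK, which is
# used as the exit code when that phase fails (see the sys.exit calls below).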
def set_title_and_block(title, err_block):
os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block]
line_str = '=' * 72
print('')
print(line_str)
print(title)
print(line_str)
def run_apache_rat_checks():
set_title_and_block("Running Apache RAT checks", "BLOCK_RAT")
run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")])
def run_scala_style_checks():
set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE")
run_cmd([os.path.join(SPARK_HOME, "dev", "lint-scala")])
def run_python_style_checks():
set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE")
run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")])
def build_spark_documentation():
set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION")
os.environ["PRODUCTION"] = "1 jekyll build"
os.chdir(os.path.join(SPARK_HOME, "docs"))
jekyll_bin = which("jekyll")
if not jekyll_bin:
print("[error] Cannot find a version of `jekyll` on the system; please",
" install one and retry to build documentation.")
sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
else:
run_cmd([jekyll_bin, "build"])
os.chdir(SPARK_HOME)
def exec_maven(mvn_args=()):
"""Will call Maven in the current directory with the list of mvn_args passed
in and returns the subprocess for any further processing"""
run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args)
def exec_sbt(sbt_args=()):
"""Will call SBT in the current directory with the list of mvn_args passed
in and returns the subprocess for any further processing"""
sbt_cmd = [os.path.join(SPARK_HOME, "build", "sbt")] + sbt_args
    sbt_output_filter = re.compile(r"^.*\[info\].*Resolving" + "|" +
                                   r"^.*\[warn\].*Merging" + "|" +
                                   r"^.*\[info\].*Including")
# NOTE: echo "q" is needed because sbt on encountering a build file
# with failure (either resolution or compilation) prompts the user for
# input either q, r, etc to quit or retry. This echo is there to make it
# not block.
echo_proc = subprocess.Popen(["echo", "\"q\n\""], stdout=subprocess.PIPE)
sbt_proc = subprocess.Popen(sbt_cmd,
stdin=echo_proc.stdout,
stdout=subprocess.PIPE)
echo_proc.wait()
for line in iter(sbt_proc.stdout.readline, ''):
if not sbt_output_filter.match(line):
print(line, end='')
retcode = sbt_proc.wait()
if retcode > 0:
exit_from_command_with_retcode(sbt_cmd, retcode)
def get_hadoop_profiles(hadoop_version):
"""
For the given Hadoop version tag, return a list of SBT profile flags for
building and testing against that Hadoop version.
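    >>> get_hadoop_profiles("hadoop2.6")
    ['-Pyarn', '-Phadoop-2.6']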
"""
sbt_maven_hadoop_profiles = {
"hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.2.1"],
"hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"],
"hadoop2.2": ["-Pyarn", "-Phadoop-2.2"],
"hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"],
"hadoop2.6": ["-Pyarn", "-Phadoop-2.6"],
}
if hadoop_version in sbt_maven_hadoop_profiles:
return sbt_maven_hadoop_profiles[hadoop_version]
else:
print("[error] Could not find", hadoop_version, "in the list. Valid options",
" are", sbt_maven_hadoop_profiles.keys())
sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
def build_spark_maven(hadoop_version):
# Enable all of the profiles for the build:
build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
mvn_goals = ["clean", "package", "-DskipTests"]
profiles_and_goals = build_profiles + mvn_goals
print("[info] Building Spark (w/Hive 1.2.1) using Maven with these arguments: ",
" ".join(profiles_and_goals))
exec_maven(profiles_and_goals)
def build_spark_sbt(hadoop_version):
# Enable all of the profiles for the build:
build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
sbt_goals = ["package",
"assembly/assembly",
"streaming-kafka-assembly/assembly",
"streaming-flume-assembly/assembly",
"streaming-mqtt-assembly/assembly",
"streaming-mqtt/test:assembly",
"streaming-kinesis-asl-assembly/assembly"]
profiles_and_goals = build_profiles + sbt_goals
print("[info] Building Spark (w/Hive 1.2.1) using SBT with these arguments: ",
" ".join(profiles_and_goals))
exec_sbt(profiles_and_goals)
def build_apache_spark(build_tool, hadoop_version):
"""Will build Spark against Hive v1.2.1 given the passed in build tool (either `sbt` or
`maven`). Defaults to using `sbt`."""
set_title_and_block("Building Spark", "BLOCK_BUILD")
rm_r("lib_managed")
if build_tool == "maven":
build_spark_maven(hadoop_version)
else:
build_spark_sbt(hadoop_version)
def detect_binary_inop_with_mima():
set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
run_cmd([os.path.join(SPARK_HOME, "dev", "mima")])
def run_scala_tests_maven(test_profiles):
mvn_test_goals = ["test", "--fail-at-end"]
profiles_and_goals = test_profiles + mvn_test_goals
print("[info] Running Spark tests using Maven with these arguments: ",
" ".join(profiles_and_goals))
exec_maven(profiles_and_goals)
def run_scala_tests_sbt(test_modules, test_profiles):
sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules))
if not sbt_test_goals:
return
profiles_and_goals = test_profiles + list(sbt_test_goals)
print("[info] Running Spark tests using SBT with these arguments: ",
" ".join(profiles_and_goals))
exec_sbt(profiles_and_goals)
def run_scala_tests(build_tool, hadoop_version, test_modules):
"""Function to properly execute all tests passed in as a set from the
`determine_test_suites` function"""
set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS")
test_modules = set(test_modules)
test_profiles = get_hadoop_profiles(hadoop_version) + \
list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules)))
if build_tool == "maven":
run_scala_tests_maven(test_profiles)
else:
run_scala_tests_sbt(test_modules, test_profiles)
def run_python_tests(test_modules, parallelism):
set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")
command = [os.path.join(SPARK_HOME, "python", "run-tests")]
if test_modules != [modules.root]:
command.append("--modules=%s" % ','.join(m.name for m in test_modules))
command.append("--parallelism=%i" % parallelism)
run_cmd(command)
def run_sparkr_tests():
set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS")
if which("R"):
run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")])
else:
print("Ignoring SparkR tests as R was not found in PATH")
def parse_opts():
parser = OptionParser(
prog="run-tests"
)
parser.add_option(
"-p", "--parallelism", type="int", default=4,
help="The number of suites to test in parallel (default %default)"
)
(opts, args) = parser.parse_args()
if args:
parser.error("Unsupported arguments: %s" % ' '.join(args))
if opts.parallelism < 1:
parser.error("Parallelism cannot be less than 1")
return opts
def main():
    opts = parse_opts()
# Ensure the user home directory (HOME) is valid and is an absolute directory
if not USER_HOME or not os.path.isabs(USER_HOME):
print("[error] Cannot determine your home directory as an absolute path;",
" ensure the $HOME environment variable is set properly.")
sys.exit(1)
os.chdir(SPARK_HOME)
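    # Clean up leftover test output and locally-published Spark artifacts from previous runs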
rm_r(os.path.join(SPARK_HOME, "work"))
rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))
os.environ["CURRENT_BLOCK"] = ERROR_CODES["BLOCK_GENERAL"]
java_exe = determine_java_executable()
if not java_exe:
print("[error] Cannot find a version of `java` on the system; please",
" install one and retry.")
sys.exit(2)
java_version = determine_java_version(java_exe)
if java_version.minor < 8:
print("[warn] Java 8 tests will not run because JDK version is < 1.8.")
if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers, set up variables
        # to reflect the environment settings
build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3")
test_env = "amplab_jenkins"
# add path for Python3 in Jenkins if we're calling from a Jenkins machine
os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH")
else:
# else we're running locally and can use local settings
build_tool = "sbt"
hadoop_version = "hadoop2.3"
test_env = "local"
print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
"under environment", test_env)
changed_modules = None
changed_files = None
if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
target_branch = os.environ["ghprbTargetBranch"]
changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
changed_modules = determine_modules_for_files(changed_files)
if not changed_modules:
changed_modules = [modules.root]
print("[info] Found the following changed modules:",
", ".join(x.name for x in changed_modules))
    # setup environment variables
    # note - the 'root' module does not collect environment variables from all modules, because a
    # module's environment variables should only be set when that module has actually changed,
    # even when running the 'root' module. So here we use changed_modules rather than test_modules.
test_environ = {}
for m in changed_modules:
test_environ.update(m.environ)
setup_test_environ(test_environ)
test_modules = determine_modules_to_test(changed_modules)
# license checks
run_apache_rat_checks()
# style checks
if not changed_files or any(f.endswith(".scala") for f in changed_files):
run_scala_style_checks()
if not changed_files or any(f.endswith(".py") for f in changed_files):
run_python_style_checks()
# determine if docs were changed and if we're inside the amplab environment
# note - the below commented out until *all* Jenkins workers can get `jekyll` installed
# if "DOCS" in changed_modules and test_env == "amplab_jenkins":
# build_spark_documentation()
# spark build
build_apache_spark(build_tool, hadoop_version)
# backwards compatibility checks
detect_binary_inop_with_mima()
# run the test suites
run_scala_tests(build_tool, hadoop_version, test_modules)
modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
if modules_with_python_tests:
run_python_tests(modules_with_python_tests, opts.parallelism)
if any(m.should_run_r_tests for m in test_modules):
run_sparkr_tests()
def _test():
import doctest
failure_count = doctest.testmod()[0]
if failure_count:
exit(-1)
if __name__ == "__main__":
    _test()
    main()