Commit 723a86b0 authored by Andrew Or, committed by Andrew Or

[Release] Bring audit scripts up-to-date

This involves a few main changes:
- Log all output messages to the log file. Previously the log file
  was not useful because it did not indicate progress (see the sketch below).
- Remove hive-site.xml in sbt_hive_app to avoid interference
- Add the appropriate repositories for new dependencies
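
A minimal sketch of the dual-logging pattern the first change introduces, simplified from the script's log/log_and_print helpers (the log file name and ".log" suffix here are illustrative; the real helpers appear in the diff below):

    import time

    # Timestamped log file so each audit run gets its own log (name is illustrative)
    LOG_FILE = open("spark_audit_%s.log" % time.strftime("%Y_%m_%d_%H_%M_%S"), "w")

    def log(msg):
        # Append to the log file and flush immediately so progress is visible mid-run
        LOG_FILE.write(msg + "\n")
        LOG_FILE.flush()

    def log_and_print(msg):
        # Echo to the console and mirror the same message into the log file
        print msg
        log(msg)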
parent d7d54a44
@@ -30,71 +30,84 @@ import sys
import time
import urllib2
# Fill in release details here:
RELEASE_URL = "http://people.apache.org/~pwendell/spark-1.0.0-rc1/"
RELEASE_KEY = "9E4FE3AF"
RELEASE_REPOSITORY = "https://repository.apache.org/content/repositories/orgapachespark-1006/"
RELEASE_VERSION = "1.0.0"
# Note: The following variables must be set before use!
RELEASE_URL = "http://people.apache.org/~andrewor14/spark-1.1.1-rc1/"
RELEASE_KEY = "XXXXXXXX" # Your 8-digit hex
RELEASE_REPOSITORY = "https://repository.apache.org/content/repositories/orgapachespark-1033"
RELEASE_VERSION = "1.1.1"
SCALA_VERSION = "2.10.4"
SCALA_BINARY_VERSION = "2.10"
#
# Do not set these
LOG_FILE_NAME = "spark_audit_%s" % time.strftime("%h_%m_%Y_%I_%M_%S")
LOG_FILE = open(LOG_FILE_NAME, 'w')
WORK_DIR = "/tmp/audit_%s" % int(time.time())
MAVEN_CMD = "mvn"
GPG_CMD = "gpg"
SBT_CMD = "sbt -Dsbt.log.noformat=true"
print "Starting tests, log output in %s. Test results printed below:" % LOG_FILE_NAME
# Track failures
# Track failures to print them at the end
failures = []
# Log a message. Use sparingly because this flushes every write.
def log(msg):
    LOG_FILE.write(msg + "\n")
    LOG_FILE.flush()
def log_and_print(msg):
    print msg
    log(msg)
# Prompt the user to delete the scratch directory used
def clean_work_files():
    print "OK to delete scratch directory '%s'? (y/N): " % WORK_DIR
    response = raw_input()
    response = raw_input("OK to delete scratch directory '%s'? (y/N) " % WORK_DIR)
    if response == "y":
        shutil.rmtree(WORK_DIR)
    print "Should I delete the log output file '%s'? (y/N): " % LOG_FILE_NAME
    response = raw_input()
    if response == "y":
        os.unlink(LOG_FILE_NAME)
# Run the given command and log its output to the log file
def run_cmd(cmd, exit_on_failure=True):
    print >> LOG_FILE, "Running command: %s" % cmd
    log("Running command: %s" % cmd)
    ret = subprocess.call(cmd, shell=True, stdout=LOG_FILE, stderr=LOG_FILE)
    if ret != 0 and exit_on_failure:
        print "Command failed: %s" % cmd
        log_and_print("Command failed: %s" % cmd)
        clean_work_files()
        sys.exit(-1)
    return ret
def run_cmd_with_output(cmd):
    print >> sys.stderr, "Running command: %s" % cmd
    log_and_print("Running command: %s" % cmd)
    return subprocess.check_output(cmd, shell=True, stderr=LOG_FILE)
# Test if the given condition is successful
# If so, print the pass message; otherwise print the failure message
def test(cond, msg):
    return passed(msg) if cond else failed(msg)
def test(bool, str):
    if bool:
        return passed(str)
    failed(str)
def passed(str):
    print "[PASSED] %s" % str
def failed(str):
    failures.append(str)
    print "[**FAILED**] %s" % str
def passed(msg):
    log_and_print("[PASSED] %s" % msg)
def failed(msg):
    failures.append(msg)
    log_and_print("[**FAILED**] %s" % msg)
def get_url(url):
    return urllib2.urlopen(url).read()
# If the path exists, prompt the user to delete it
# If the resource is not deleted, abort
def ensure_path_not_present(path):
    full_path = os.path.expanduser(path)
    if os.path.exists(full_path):
        print "Found %s locally." % full_path
        response = raw_input("This can interfere with testing published artifacts. OK to delete? (y/N) ")
        if response == "y":
            shutil.rmtree(full_path)
        else:
            print "Abort."
            sys.exit(-1)
log_and_print("|-------- Starting Spark audit tests for release %s --------|" % RELEASE_VERSION)
log_and_print("Log output can be found in %s" % LOG_FILE_NAME)
original_dir = os.getcwd()
@@ -114,37 +127,36 @@ local_ivy_spark = "~/.ivy2/local/org.apache.spark"
cache_ivy_spark = "~/.ivy2/cache/org.apache.spark"
local_maven_kafka = "~/.m2/repository/org/apache/kafka"
local_maven_kafka = "~/.m2/repository/org/apache/spark"
def ensure_path_not_present(x):
    if os.path.exists(os.path.expanduser(x)):
        print "Please remove %s, it can interfere with testing published artifacts." % x
        sys.exit(-1)
map(ensure_path_not_present, [local_ivy_spark, cache_ivy_spark, local_maven_kafka])
# SBT build tests
log_and_print("==== Building SBT modules ====")
os.chdir("blank_sbt_build")
os.environ["SPARK_VERSION"] = RELEASE_VERSION
os.environ["SCALA_VERSION"] = SCALA_VERSION
os.environ["SPARK_RELEASE_REPOSITORY"] = RELEASE_REPOSITORY
os.environ["SPARK_AUDIT_MASTER"] = "local"
for module in modules:
log("==== Building module %s in SBT ====" % module)
os.environ["SPARK_MODULE"] = module
ret = run_cmd("sbt clean update", exit_on_failure=False)
test(ret == 0, "sbt build against '%s' module" % module)
ret = run_cmd("%s clean update" % SBT_CMD, exit_on_failure=False)
test(ret == 0, "SBT build against '%s' module" % module)
os.chdir(original_dir)
# SBT application tests
log_and_print("==== Building SBT applications ====")
for app in ["sbt_app_core", "sbt_app_graphx", "sbt_app_streaming", "sbt_app_sql", "sbt_app_hive", "sbt_app_kinesis"]:
log("==== Building application %s in SBT ====" % app)
os.chdir(app)
ret = run_cmd("sbt clean run", exit_on_failure=False)
test(ret == 0, "sbt application (%s)" % app)
ret = run_cmd("%s clean run" % SBT_CMD, exit_on_failure=False)
test(ret == 0, "SBT application (%s)" % app)
os.chdir(original_dir)
# Maven build tests
os.chdir("blank_maven_build")
log_and_print("==== Building Maven modules ====")
for module in modules:
log("==== Building module %s in maven ====" % module)
cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" '
'-Dspark.module="%s" clean compile' %
(MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, module))
@@ -152,6 +164,8 @@ for module in modules:
test(ret == 0, "maven build against '%s' module" % module)
os.chdir(original_dir)
# Maven application tests
log_and_print("==== Building Maven applications ====")
os.chdir("maven_app_core")
mvn_exec_cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" '
                '-Dscala.binary.version="%s" clean compile '
@@ -172,15 +186,14 @@ index_page = get_url(RELEASE_URL)
artifact_regex = r = re.compile("<a href=\"(.*.tgz)\">")
artifacts = r.findall(index_page)
# Verify artifact integrity
for artifact in artifacts:
print "==== Verifying download integrity for artifact: %s ====" % artifact
log_and_print("==== Verifying download integrity for artifact: %s ====" % artifact)
artifact_url = "%s/%s" % (RELEASE_URL, artifact)
run_cmd("wget %s" % artifact_url)
key_file = "%s.asc" % artifact
run_cmd("wget %s" % artifact_url)
run_cmd("wget %s/%s" % (RELEASE_URL, key_file))
run_cmd("wget %s%s" % (artifact_url, ".sha"))
# Verify signature
@@ -208,31 +221,17 @@ for artifact in artifacts:
os.chdir(WORK_DIR)
for artifact in artifacts:
print "==== Verifying build and tests for artifact: %s ====" % artifact
os.chdir(os.path.join(WORK_DIR, dir_name))
os.environ["MAVEN_OPTS"] = "-Xmx3g -XX:MaxPermSize=1g -XX:ReservedCodeCacheSize=1g"
# Verify build
print "==> Running build"
run_cmd("sbt assembly")
passed("sbt build successful")
run_cmd("%s package -DskipTests" % MAVEN_CMD)
passed("Maven build successful")
# Verify tests
print "==> Performing unit tests"
run_cmd("%s test" % MAVEN_CMD)
passed("Tests successful")
os.chdir(WORK_DIR)
clean_work_files()
# Report result
log_and_print("\n")
if len(failures) == 0:
print "ALL TESTS PASSED"
log_and_print("*** ALL TESTS PASSED ***")
else:
print "SOME TESTS DID NOT PASS"
log_and_print("XXXXX SOME TESTS DID NOT PASS XXXXX")
for f in failures:
print f
log_and_print(" %s" % f)
os.chdir(original_dir)
# Clean up
clean_work_files()
log_and_print("|-------- Spark release audit complete --------|")
@@ -19,10 +19,12 @@ name := "Spark Release Auditor"
version := "1.0"
scalaVersion := "2.9.3"
scalaVersion := System.getenv.get("SCALA_VERSION")
libraryDependencies += "org.apache.spark" % System.getenv.get("SPARK_MODULE") % System.getenv.get("SPARK_VERSION")
resolvers ++= Seq(
"Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"),
"Eclipse Paho Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/",
"Maven Repository" at "http://repo1.maven.org/maven2/",
"Spray Repository" at "http://repo.spray.cc/")
@@ -25,4 +25,5 @@ libraryDependencies += "org.apache.spark" %% "spark-hive" % System.getenv.get("S
resolvers ++= Seq(
"Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"),
"Maven Repository" at "http://repo1.maven.org/maven2/",
"Spray Repository" at "http://repo.spray.cc/")
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<!-- Hive Configuration can either be stored in this file or in the hadoop configuration files -->
<!-- that are implied by Hadoop setup variables. -->
<!-- Aside from Hadoop setup variables - this file is provided as a convenience so that Hive -->
<!-- users do not have to edit hadoop configuration files (that may be managed as a centralized -->
<!-- resource). -->
<!-- Hive Execution Parameters -->
<property name="build.dir" value="build" />
<property>
<name>build.dir</name>
<value>${user.dir}/build</value>
</property>
<property>
<name>build.dir.hive</name>
<value>${build.dir}/hive</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>${build.dir.hive}/test/hadoop-${user.name}</value>
<description>A base for other temporary directories.</description>
</property>
<!--
<property>
<name>hive.exec.reducers.max</name>
<value>1</value>
<description>maximum number of reducers</description>
</property>
-->
<property>
<name>hive.exec.scratchdir</name>
<value>${build.dir}/scratchdir</value>
<description>Scratch space for Hive jobs</description>
</property>
<property>
<name>hive.exec.local.scratchdir</name>
<value>${build.dir}/localscratchdir/</value>
<description>Local scratch space for Hive jobs</description>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<!-- note: variable substituion not working here because it's loaded by jdo, not Hive -->
<value>jdbc:derby:;databaseName=../build/test/junit_metastore_db;create=true</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.apache.derby.jdbc.EmbeddedDriver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>APP</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>mine</value>
</property>
<property>
<!-- this should eventually be deprecated since the metastore should supply this -->
<name>hive.metastore.warehouse.dir</name>
<value>${test.warehouse.dir}</value>
<description></description>
</property>
<property>
<name>hive.metastore.metadb.dir</name>
<value>${build.dir}/test/data/metadb/</value>
<description>
Required by metastore server or if the uris argument below is not supplied
</description>
</property>
<property>
<name>test.log.dir</name>
<value>${build.dir}/test/logs</value>
<description></description>
</property>
<property>
<name>test.src.dir</name>
<value>${build.dir}/src/test</value>
<description></description>
</property>
<!--
<property>
<name>test.data.files</name>
<value>${user.dir}/../data/files</value>
<description></description>
</property>
<property>
<name>test.query.file1</name>
<value>file://${user.dir}/../ql/src/test/org/apache/hadoop/hive/ql/input2.q</value>
<value></value>
<description></description>
</property>
-->
<property>
<name>hive.jar.path</name>
<value>${build.dir.hive}/ql/hive-exec-${version}.jar</value>
<description></description>
</property>
<property>
<name>hive.metastore.rawstore.impl</name>
<value>org.apache.hadoop.hive.metastore.ObjectStore</value>
<description>Name of the class that implements org.apache.hadoop.hive.metastore.rawstore interface. This class is used to store and retrieval of raw metadata objects such as table, database</description>
</property>
<property>
<name>hive.querylog.location</name>
<value>${build.dir}/tmp</value>
<description>Location of the structured hive logs</description>
</property>
<!--
<property>
<name>hive.exec.pre.hooks</name>
<value>org.apache.hadoop.hive.ql.hooks.PreExecutePrinter, org.apache.hadoop.hive.ql.hooks.EnforceReadOnlyTables</value>
<description>Pre Execute Hook for Tests</description>
</property>
<property>
<name>hive.exec.post.hooks</name>
<value>org.apache.hadoop.hive.ql.hooks.PostExecutePrinter</value>
<description>Post Execute Hook for Tests</description>
</property>
-->
<property>
<name>hive.task.progress</name>
<value>false</value>
<description>Track progress of a task</description>
</property>
<property>
<name>hive.support.concurrency</name>
<value>false</value>
<description>Whether hive supports concurrency or not. A zookeeper instance must be up and running for the default hive lock manager to support read-write locks.</description>
</property>
<property>
<name>fs.pfile.impl</name>
<value>org.apache.hadoop.fs.ProxyLocalFileSystem</value>
<description>A proxy for local file system used for cross file system testing</description>
</property>
<property>
<name>hive.exec.mode.local.auto</name>
<value>false</value>
<description>
Let hive determine whether to run in local mode automatically
Disabling this for tests so that minimr is not affected
</description>
</property>
<property>
<name>hive.auto.convert.join</name>
<value>false</value>
<description>Whether Hive enable the optimization about converting common join into mapjoin based on the input file size</description>
</property>
<property>
<name>hive.ignore.mapjoin.hint</name>
<value>false</value>
<description>Whether Hive ignores the mapjoin hint</description>
</property>
<property>
<name>hive.input.format</name>
<value>org.apache.hadoop.hive.ql.io.CombineHiveInputFormat</value>
<description>The default input format, if it is not specified, the system assigns it. It is set to HiveInputFormat for hadoop versions 17, 18 and 19, whereas it is set to CombineHiveInputFormat for hadoop 20. The user can always overwrite it - if there is a bug in CombineHiveInputFormat, it can always be manually set to HiveInputFormat. </description>
</property>
<property>
<name>hive.default.rcfile.serde</name>
<value>org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe</value>
<description>The default SerDe hive will use for the rcfile format</description>
</property>
</configuration>