From a5c10ff238e4a117f597e017b7d746404aaa1663 Mon Sep 17 00:00:00 2001 From: Holden Karau <holden@us.ibm.com> Date: Wed, 25 Jan 2017 14:43:39 -0800 Subject: [PATCH] [SPARK-19064][PYSPARK] Fix pip installing of sub components ## What changes were proposed in this pull request? Fix installation of mllib and ml sub components, and more eagerly clean up cache files during test script & make-distribution. ## How was this patch tested? Updated sanity test script to import mllib and ml sub-components. Author: Holden Karau <holden@us.ibm.com> Closes #16465 from holdenk/SPARK-19064-fix-pip-install-sub-components. (cherry picked from commit 965c82d8c4b7f2d4dfbc45ec4d47d6b6588094c3) Signed-off-by: Holden Karau <holden@us.ibm.com> --- dev/make-distribution.sh | 2 ++ dev/pip-sanity-check.py | 2 ++ dev/requirements.txt | 1 + dev/run-pip-tests | 7 +++++-- python/setup.py | 5 +++++ 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 6ea319e436..00e0580a34 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -213,6 +213,8 @@ cp -r "$SPARK_HOME/data" "$DISTDIR" if [ "$MAKE_PIP" == "true" ]; then echo "Building python distribution package" pushd "$SPARK_HOME/python" > /dev/null + # Delete the egg info file if it exists, this can cache older setup files. 
+ rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion" python setup.py sdist popd > /dev/null else diff --git a/dev/pip-sanity-check.py b/dev/pip-sanity-check.py index 430c2ab527..c491005f49 100644 --- a/dev/pip-sanity-check.py +++ b/dev/pip-sanity-check.py @@ -18,6 +18,8 @@ from __future__ import print_function from pyspark.sql import SparkSession +from pyspark.ml.param import Params +from pyspark.mllib.linalg import * import sys if __name__ == "__main__": diff --git a/dev/requirements.txt b/dev/requirements.txt index bf042d22a8..79782279f8 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -1,3 +1,4 @@ jira==1.0.3 PyGithub==1.26.0 Unidecode==0.04.19 +pypandoc==1.3.3 diff --git a/dev/run-pip-tests b/dev/run-pip-tests index e1da18e60b..af1b1feb70 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -78,11 +78,14 @@ for python in "${PYTHON_EXECS[@]}"; do mkdir -p "$VIRTUALENV_PATH" virtualenv --python=$python "$VIRTUALENV_PATH" source "$VIRTUALENV_PATH"/bin/activate - # Upgrade pip - pip install --upgrade pip + # Upgrade pip & friends + pip install --upgrade pip pypandoc wheel + pip install numpy # Needed so we can verify mllib imports echo "Creating pip installable source dist" cd "$FWDIR"/python + # Delete the egg info file if it exists, this can cache the setup file. + rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion" $python setup.py sdist diff --git a/python/setup.py b/python/setup.py index bc2eb4ce9d..47eab98e0f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -162,7 +162,12 @@ try: url='https://github.com/apache/spark/tree/master/python', packages=['pyspark', 'pyspark.mllib', + 'pyspark.mllib.linalg', + 'pyspark.mllib.stat', 'pyspark.ml', + 'pyspark.ml.linalg', + 'pyspark.ml.param', + 'pyspark.ml.stat', 'pyspark.sql', 'pyspark.streaming', 'pyspark.bin', -- GitLab