From 82e890fb19d6fbaffa69856eecb4699f2f8a81eb Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin <vanzin@cloudera.com> Date: Tue, 12 May 2015 01:39:21 -0700 Subject: [PATCH] [SPARK-7485] [BUILD] Remove pyspark files from assembly. The sbt part of the build is hacky; it basically tricks sbt into generating the zip by using a generator, but returns an empty list for the generated files so that nothing is actually added to the assembly. Author: Marcelo Vanzin <vanzin@cloudera.com> Closes #6022 from vanzin/SPARK-7485 and squashes the following commits: 22c1e04 [Marcelo Vanzin] Remove unneeded code. 4893622 [Marcelo Vanzin] [SPARK-7485] [build] Remove pyspark files from assembly. --- core/pom.xml | 47 ---------------------------------------- mllib/pom.xml | 11 ---------- project/SparkBuild.scala | 44 +++---------------------------------- sql/core/pom.xml | 8 ------- streaming/pom.xml | 8 ------- 5 files changed, 3 insertions(+), 115 deletions(-) diff --git a/core/pom.xml b/core/pom.xml index fc42f48973..262a3320db 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -381,35 +381,6 @@ <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory> <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory> <plugins> - <!-- Unzip py4j so we can include its files in the jar --> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-antrun-plugin</artifactId> - <executions> - <execution> - <phase>generate-resources</phase> - <goals> - <goal>run</goal> - </goals> - </execution> - </executions> - <configuration> - <target> - <unzip src="../python/lib/py4j-0.8.2.1-src.zip" dest="../python/build" /> - </target> - </configuration> - </plugin> - <plugin> - <artifactId>maven-clean-plugin</artifactId> - <configuration> - <filesets> - <fileset> - <directory>${basedir}/../python/build</directory> - </fileset> - </filesets> - <verbose>true</verbose> - </configuration> - </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-dependency-plugin</artifactId> @@ -438,24 +409,6 @@ </executions> </plugin> </plugins> - - <resources> - <resource> - <directory>src/main/resources</directory> - </resource> - <resource> - <directory>../python</directory> - <includes> - <include>pyspark/*.py</include> - </includes> - </resource> - <resource> - <directory>../python/build</directory> - <includes> - <include>py4j/*.py</include> - </includes> - </resource> - </resources> </build> <profiles> diff --git a/mllib/pom.xml b/mllib/pom.xml index a3c57ae260..0c07ca1a62 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -141,16 +141,5 @@ <build> <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory> <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory> - <resources> - <resource> - <directory>../python</directory> - <includes> - <include>pyspark/mllib/*.py</include> - <include>pyspark/mllib/stat/*.py</include> - <include>pyspark/ml/*.py</include> - <include>pyspark/ml/param/*.py</include> - </includes> - </resource> - </resources> </build> </project> diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 186345af0e..1b87e4e98b 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -168,7 +168,7 @@ object SparkBuild extends PomBuild { /* Enable Assembly for all assembly projects */ assemblyProjects.foreach(enable(Assembly.settings)) - /* Package pyspark artifacts in the main assembly. */ + /* Package pyspark artifacts in a separate zip file for YARN. */ enable(PySparkAssembly.settings)(assembly) /* Enable unidoc only for the root spark project */ @@ -373,22 +373,15 @@ object PySparkAssembly { import java.util.zip.{ZipOutputStream, ZipEntry} lazy val settings = Seq( - unmanagedJars in Compile += { BuildCommons.sparkHome / "python/lib/py4j-0.8.2.1-src.zip" }, // Use a resource generator to copy all .py files from python/pyspark into a managed directory // to be included in the assembly. We can't just add "python/" to the assembly's resource dir // list since that will copy unneeded / unwanted files. resourceGenerators in Compile <+= resourceManaged in Compile map { outDir: File => val src = new File(BuildCommons.sparkHome, "python/pyspark") - val zipFile = new File(BuildCommons.sparkHome , "python/lib/pyspark.zip") zipFile.delete() zipRecursive(src, zipFile) - - val dst = new File(outDir, "pyspark") - if (!dst.isDirectory()) { - require(dst.mkdirs()) - } - copy(src, dst) + Seq[File]() } ) @@ -416,42 +409,11 @@ object PySparkAssembly { output.write(buf, 0, n) } } + output.closeEntry() in.close() } } - private def copy(src: File, dst: File): Seq[File] = { - src.listFiles().flatMap { f => - val child = new File(dst, f.getName()) - if (f.isDirectory()) { - child.mkdir() - copy(f, child) - } else if (f.getName().endsWith(".py")) { - var in: Option[FileInputStream] = None - var out: Option[FileOutputStream] = None - try { - in = Some(new FileInputStream(f)) - out = Some(new FileOutputStream(child)) - - val bytes = new Array[Byte](1024) - var read = 0 - while (read >= 0) { - read = in.get.read(bytes) - if (read > 0) { - out.get.write(bytes, 0, read) - } - } - - Some(child) - } finally { - in.foreach(_.close()) - out.foreach(_.close()) - } - } else { - None - } - } - } } object Unidoc { diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 7d274a73e0..ffe95bb491 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -103,13 +103,5 @@ <build> <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory> <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory> - <resources> - <resource> - <directory>../../python</directory> - <includes> - <include>pyspark/sql/*.py</include> - </includes> - </resource> - </resources> </build> </project> diff --git a/streaming/pom.xml b/streaming/pom.xml index 5ca55a4f68..5ab7f4472c 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -105,13 +105,5 @@ </configuration> </plugin> </plugins> - <resources> - <resource> - <directory>../python</directory> - <includes> - <include>pyspark/streaming/*.py</include> - </includes> - </resource> - </resources> </build> </project> -- GitLab