Commit 08488c17 authored by Davies Liu, committed by Reynold Xin

[SPARK-5469] restructure pyspark.sql into multiple files

All the DataTypes moved into pyspark.sql.types
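For user code, the split looks roughly like this (a sketch; only the names re-exported from `pyspark/sql/__init__.py` below remain importable from the top-level package):

```python
# Before SPARK-5469, everything lived in the single pyspark.sql module:
#   from pyspark.sql import SQLContext, Row, StructType, StructField, IntegerType
# After it, entry points stay in pyspark.sql while the DataTypes move:
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StructType, StructField, IntegerType
```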

The changes can be tracked with `--find-copies-harder -M25` (copy detection, with a 25% rename-similarity threshold):
```
davies@localhost:~/work/spark/python$ git diff --find-copies-harder -M25 --numstat master..
2       5       python/docs/pyspark.ml.rst
0       3       python/docs/pyspark.mllib.rst
10      2       python/docs/pyspark.sql.rst
1       1       python/pyspark/mllib/linalg.py
21      14      python/pyspark/{mllib => sql}/__init__.py
14      2108    python/pyspark/{sql.py => sql/context.py}
10      1772    python/pyspark/{sql.py => sql/dataframe.py}
7       6       python/pyspark/{sql_tests.py => sql/tests.py}
8       1465    python/pyspark/{sql.py => sql/types.py}
4       2       python/run-tests
1       1       sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala
```

The history can also be tracked with `git blame -C -C python/pyspark/sql/context.py`.

Author: Davies Liu <davies@databricks.com>

Closes #4479 from davies/sql and squashes the following commits:

1b5f0a5 [Davies Liu] Merge branch 'master' of github.com:apache/spark into sql
2b2b983 [Davies Liu] restructure pyspark.sql
Parent: d302c480
python/docs/pyspark.ml.rst:
```diff
 pyspark.ml package
 =====================
-Submodules
-----------
-pyspark.ml module
------------------
+Module Context
+--------------
 .. automodule:: pyspark.ml
     :members:
```
python/docs/pyspark.mllib.rst (the three deleted lines are in a collapsed hunk not shown in this capture):
```diff
 pyspark.mllib package
 =====================
 Submodules
 ----------
 pyspark.mllib.classification module
 -----------------------------------
```
python/docs/pyspark.sql.rst:
```diff
 pyspark.sql module
 ==================
-Module contents
----------------
+Module Context
+--------------
 .. automodule:: pyspark.sql
     :members:
     :undoc-members:
     :show-inheritance:
+
+pyspark.sql.types module
+------------------------
+
+.. automodule:: pyspark.sql.types
+    :members:
+    :undoc-members:
+    :show-inheritance:
```
python/pyspark/mllib/linalg.py:
```diff
@@ -29,7 +29,7 @@ import copy_reg
 import numpy as np

-from pyspark.sql import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
+from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
     IntegerType, ByteType
```
python/pyspark/{mllib => sql}/__init__.py — the new package's __init__.py:
```python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Public classes of Spark SQL:

    - L{SQLContext}
      Main entry point for SQL functionality.
    - L{DataFrame}
      A Resilient Distributed Dataset (RDD) with schema information for the data contained. In
      addition to normal RDD operations, DataFrames also support SQL.
    - L{GroupedData}
    - L{Column}
      Column is a DataFrame with a single column.
    - L{Row}
      A Row of data returned by a Spark SQL query.
    - L{HiveContext}
      Main entry point for accessing data stored in Apache Hive.
"""

from pyspark.sql.context import SQLContext, HiveContext
from pyspark.sql.types import Row
from pyspark.sql.dataframe import DataFrame, GroupedData, Column, Dsl, SchemaRDD

__all__ = [
    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row',
    'Dsl',
]
```
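A minimal end-to-end sketch of the restructured package in use (assumes an already-running SparkContext `sc`; `applySchema` and `inferSchema` are the entry points exercised by tests.py below):

```python
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

sqlCtx = SQLContext(sc)  # sc: an existing SparkContext (assumed)

# Build a DataFrame from an RDD plus an explicit schema from pyspark.sql.types
schema = StructType([StructField("id", IntegerType(), False),
                     StructField("name", StringType(), True)])
df = sqlCtx.applySchema(sc.parallelize([(1, "a"), (2, "b")]), schema)

# Register it and query it with SQL
df.registerTempTable("people")
print(sqlCtx.sql("SELECT name FROM people WHERE id = 1").collect())
```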
(Two large diffs are collapsed here: per the numstat above, python/pyspark/sql/context.py and python/pyspark/sql/dataframe.py.)
python/pyspark/{sql_tests.py => sql/tests.py}:
```diff
@@ -34,8 +34,10 @@ if sys.version_info[:2] <= (2, 6):
 else:
     import unittest

-from pyspark.sql import SQLContext, IntegerType, Row, ArrayType, StructType, StructField, \
-    UserDefinedType, DoubleType
+from pyspark.sql import SQLContext, Column
+from pyspark.sql.types import IntegerType, Row, ArrayType, StructType, StructField, \
+    UserDefinedType, DoubleType, LongType
 from pyspark.tests import ReusedPySparkTestCase
@@ -220,7 +222,7 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertEqual(1.0, row.asDict()['d']['key'].c)

     def test_infer_schema_with_udt(self):
-        from pyspark.sql_tests import ExamplePoint, ExamplePointUDT
+        from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
         row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
         rdd = self.sc.parallelize([row])
         df = self.sqlCtx.inferSchema(rdd)
@@ -232,7 +234,7 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertEqual(point, ExamplePoint(1.0, 2.0))

     def test_apply_schema_with_udt(self):
-        from pyspark.sql_tests import ExamplePoint, ExamplePointUDT
+        from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
         row = (1.0, ExamplePoint(1.0, 2.0))
         rdd = self.sc.parallelize([row])
         schema = StructType([StructField("label", DoubleType(), False),
@@ -242,7 +244,7 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertEquals(point, ExamplePoint(1.0, 2.0))

     def test_parquet_with_udt(self):
-        from pyspark.sql_tests import ExamplePoint
+        from pyspark.sql.tests import ExamplePoint
         row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
         rdd = self.sc.parallelize([row])
         df0 = self.sqlCtx.inferSchema(rdd)
@@ -253,7 +255,6 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertEquals(point, ExamplePoint(1.0, 2.0))

     def test_column_operators(self):
-        from pyspark.sql import Column, LongType
         ci = self.df.key
         cs = self.df.value
         c = ci == cs
```
(Another large diff is collapsed here: python/pyspark/sql/types.py.)
python/run-tests:
```diff
@@ -64,8 +64,10 @@ function run_core_tests() {
 function run_sql_tests() {
     echo "Run sql tests ..."
-    run_test "pyspark/sql.py"
-    run_test "pyspark/sql_tests.py"
+    run_test "pyspark/sql/types.py"
+    run_test "pyspark/sql/context.py"
+    run_test "pyspark/sql/dataframe.py"
+    run_test "pyspark/sql/tests.py"
 }

 function run_mllib_tests() {
```
sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala:
```diff
@@ -37,7 +37,7 @@ private[sql] class ExamplePointUDT extends UserDefinedType[ExamplePoint] {
   override def sqlType: DataType = ArrayType(DoubleType, false)

-  override def pyUDT: String = "pyspark.sql_tests.ExamplePointUDT"
+  override def pyUDT: String = "pyspark.sql.tests.ExamplePointUDT"

   override def serialize(obj: Any): Seq[Double] = {
     obj match {
```
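For context, the `pyUDT` string names the Python counterpart of this Scala UDT; a condensed sketch of the pair now defined in pyspark/sql/tests.py (abbreviated here, not the full test code):

```python
from pyspark.sql.types import UserDefinedType, ArrayType, DoubleType

class ExamplePointUDT(UserDefinedType):
    """Condensed from pyspark/sql/tests.py: the SQL type for ExamplePoint."""

    @classmethod
    def sqlType(cls):
        # Must agree with the Scala side: ArrayType(DoubleType, false)
        return ArrayType(DoubleType(), False)

    @classmethod
    def module(cls):
        # The new module path, matching the updated pyUDT string above
        return 'pyspark.sql.tests'

    def serialize(self, obj):
        return [obj.x, obj.y]

    def deserialize(self, datum):
        return ExamplePoint(datum[0], datum[1])


class ExamplePoint(object):
    """A 2-D point whose SQL representation is ExamplePointUDT."""
    __UDT__ = ExamplePointUDT()

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __eq__(self, other):
        return (isinstance(other, ExamplePoint)
                and other.x == self.x and other.y == self.y)
```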