Commit 08488c17 authored by Davies Liu, committed by Reynold Xin

[SPARK-5469] restructure pyspark.sql into multiple files

All the DataTypes moved into pyspark.sql.types
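For user code, the split looks roughly like this (a sketch; only the names re-exported from `pyspark/sql/__init__.py` below remain importable from the top-level package):

```python
# Before SPARK-5469, everything lived in the single pyspark.sql module:
#   from pyspark.sql import SQLContext, Row, StructType, StructField, IntegerType
# After it, entry points stay in pyspark.sql while the DataTypes move:
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StructType, StructField, IntegerType
```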

The changes can be tracked with `--find-copies-harder -M25` (copy detection, with a 25% rename-similarity threshold):
```
davies@localhost:~/work/spark/python$ git diff --find-copies-harder -M25 --numstat master..
2       5       python/docs/pyspark.ml.rst
0       3       python/docs/pyspark.mllib.rst
10      2       python/docs/pyspark.sql.rst
1       1       python/pyspark/mllib/linalg.py
21      14      python/pyspark/{mllib => sql}/__init__.py
14      2108    python/pyspark/{sql.py => sql/context.py}
10      1772    python/pyspark/{sql.py => sql/dataframe.py}
7       6       python/pyspark/{sql_tests.py => sql/tests.py}
8       1465    python/pyspark/{sql.py => sql/types.py}
4       2       python/run-tests
1       1       sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala
```

The history can also be tracked with `git blame -C -C python/pyspark/sql/context.py`.

Author: Davies Liu <davies@databricks.com>

Closes #4479 from davies/sql and squashes the following commits:

1b5f0a5 [Davies Liu] Merge branch 'master' of github.com:apache/spark into sql
2b2b983 [Davies Liu] restructure pyspark.sql
Parent: d302c480
python/docs/pyspark.ml.rst:
```diff
 pyspark.ml package
 =====================
-Submodules
-----------
-pyspark.ml module
------------------
+Module Context
+--------------
 .. automodule:: pyspark.ml
     :members:
```
python/docs/pyspark.mllib.rst (the three deleted lines are in a collapsed hunk not shown in this capture):
```diff
 pyspark.mllib package
 =====================
 Submodules
 ----------
 pyspark.mllib.classification module
 -----------------------------------
```
python/docs/pyspark.sql.rst:
```diff
 pyspark.sql module
 ==================
-Module contents
----------------
+Module Context
+--------------
 .. automodule:: pyspark.sql
     :members:
     :undoc-members:
     :show-inheritance:
+
+pyspark.sql.types module
+------------------------
+
+.. automodule:: pyspark.sql.types
+    :members:
+    :undoc-members:
+    :show-inheritance:
```
python/pyspark/mllib/linalg.py:
```diff
@@ -29,7 +29,7 @@ import copy_reg
 import numpy as np

-from pyspark.sql import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
+from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
     IntegerType, ByteType
```
python/pyspark/{mllib => sql}/__init__.py — the new package's __init__.py:
```python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Public classes of Spark SQL:

    - L{SQLContext}
      Main entry point for SQL functionality.
    - L{DataFrame}
      A Resilient Distributed Dataset (RDD) with schema information for the data contained. In
      addition to normal RDD operations, DataFrames also support SQL.
    - L{GroupedData}
    - L{Column}
      Column is a DataFrame with a single column.
    - L{Row}
      A Row of data returned by a Spark SQL query.
    - L{HiveContext}
      Main entry point for accessing data stored in Apache Hive.
"""

from pyspark.sql.context import SQLContext, HiveContext
from pyspark.sql.types import Row
from pyspark.sql.dataframe import DataFrame, GroupedData, Column, Dsl, SchemaRDD

__all__ = [
    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row',
    'Dsl',
]
```
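A minimal end-to-end sketch of the restructured package in use (assumes an already-running SparkContext `sc`; `applySchema` and `inferSchema` are the entry points exercised by tests.py below):

```python
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

sqlCtx = SQLContext(sc)  # sc: an existing SparkContext (assumed)

# Build a DataFrame from an RDD plus an explicit schema from pyspark.sql.types
schema = StructType([StructField("id", IntegerType(), False),
                     StructField("name", StringType(), True)])
df = sqlCtx.applySchema(sc.parallelize([(1, "a"), (2, "b")]), schema)

# Register it and query it with SQL
df.registerTempTable("people")
print(sqlCtx.sql("SELECT name FROM people WHERE id = 1").collect())
```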
(Two large diffs are collapsed here: per the numstat above, python/pyspark/sql/context.py and python/pyspark/sql/dataframe.py.)
python/pyspark/{sql_tests.py => sql/tests.py}:
```diff
@@ -34,8 +34,10 @@ if sys.version_info[:2] <= (2, 6):
 else:
     import unittest

-from pyspark.sql import SQLContext, IntegerType, Row, ArrayType, StructType, StructField, \
-    UserDefinedType, DoubleType
+from pyspark.sql import SQLContext, Column
+from pyspark.sql.types import IntegerType, Row, ArrayType, StructType, StructField, \
+    UserDefinedType, DoubleType, LongType
 from pyspark.tests import ReusedPySparkTestCase
@@ -220,7 +222,7 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertEqual(1.0, row.asDict()['d']['key'].c)

     def test_infer_schema_with_udt(self):
-        from pyspark.sql_tests import ExamplePoint, ExamplePointUDT
+        from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
         row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
         rdd = self.sc.parallelize([row])
         df = self.sqlCtx.inferSchema(rdd)
@@ -232,7 +234,7 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertEqual(point, ExamplePoint(1.0, 2.0))

     def test_apply_schema_with_udt(self):
-        from pyspark.sql_tests import ExamplePoint, ExamplePointUDT
+        from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
         row = (1.0, ExamplePoint(1.0, 2.0))
         rdd = self.sc.parallelize([row])
         schema = StructType([StructField("label", DoubleType(), False),
@@ -242,7 +244,7 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertEquals(point, ExamplePoint(1.0, 2.0))

     def test_parquet_with_udt(self):
-        from pyspark.sql_tests import ExamplePoint
+        from pyspark.sql.tests import ExamplePoint
         row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
         rdd = self.sc.parallelize([row])
         df0 = self.sqlCtx.inferSchema(rdd)
@@ -253,7 +255,6 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertEquals(point, ExamplePoint(1.0, 2.0))

     def test_column_operators(self):
-        from pyspark.sql import Column, LongType
         ci = self.df.key
         cs = self.df.value
         c = ci == cs
```
(Another large diff is collapsed here: python/pyspark/sql/types.py.)
python/run-tests:
```diff
@@ -64,8 +64,10 @@ function run_core_tests() {
 function run_sql_tests() {
     echo "Run sql tests ..."
-    run_test "pyspark/sql.py"
-    run_test "pyspark/sql_tests.py"
+    run_test "pyspark/sql/types.py"
+    run_test "pyspark/sql/context.py"
+    run_test "pyspark/sql/dataframe.py"
+    run_test "pyspark/sql/tests.py"
 }

 function run_mllib_tests() {
```
sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala:
```diff
@@ -37,7 +37,7 @@ private[sql] class ExamplePointUDT extends UserDefinedType[ExamplePoint] {
   override def sqlType: DataType = ArrayType(DoubleType, false)

-  override def pyUDT: String = "pyspark.sql_tests.ExamplePointUDT"
+  override def pyUDT: String = "pyspark.sql.tests.ExamplePointUDT"

   override def serialize(obj: Any): Seq[Double] = {
     obj match {
```
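For context, the `pyUDT` string names the Python counterpart of this Scala UDT; a condensed sketch of the pair now defined in pyspark/sql/tests.py (abbreviated here, not the full test code):

```python
from pyspark.sql.types import UserDefinedType, ArrayType, DoubleType

class ExamplePointUDT(UserDefinedType):
    """Condensed from pyspark/sql/tests.py: the SQL type for ExamplePoint."""

    @classmethod
    def sqlType(cls):
        # Must agree with the Scala side: ArrayType(DoubleType, false)
        return ArrayType(DoubleType(), False)

    @classmethod
    def module(cls):
        # The new module path, matching the updated pyUDT string above
        return 'pyspark.sql.tests'

    def serialize(self, obj):
        return [obj.x, obj.y]

    def deserialize(self, datum):
        return ExamplePoint(datum[0], datum[1])


class ExamplePoint(object):
    """A 2-D point whose SQL representation is ExamplePointUDT."""
    __UDT__ = ExamplePointUDT()

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __eq__(self, other):
        return (isinstance(other, ExamplePoint)
                and other.x == self.x and other.y == self.y)
```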