From 5e1b0ef07942a041195b3decd05d86c289bc8d2b Mon Sep 17 00:00:00 2001 From: Reynold Xin <rxin@databricks.com> Date: Thu, 6 Aug 2015 10:39:16 -0700 Subject: [PATCH] [SPARK-9659][SQL] Rename inSet to isin to match Pandas function. Inspiration drawn from this blog post: https://lab.getbase.com/pandarize-spark-dataframes/ Author: Reynold Xin <rxin@databricks.com> Closes #7977 from rxin/isin and squashes the following commits: 9b1d3d6 [Reynold Xin] Added return. 2197d37 [Reynold Xin] Fixed test case. 7c1b6cf [Reynold Xin] Import warnings. 4f4a35d [Reynold Xin] [SPARK-9659][SQL] Rename inSet to isin to match Pandas function. --- python/pyspark/sql/column.py | 20 ++++++++++++++++++- .../scala/org/apache/spark/sql/Column.scala | 13 +++++++++++- .../spark/sql/ColumnExpressionSuite.scala | 14 ++++++------- 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 0a85da7443..8af8637cf9 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -16,6 +16,7 @@ # import sys +import warnings if sys.version >= '3': basestring = str @@ -254,12 +255,29 @@ class Column(object): [Row(age=5, name=u'Bob')] >>> df[df.age.inSet([1, 2, 3])].collect() [Row(age=2, name=u'Alice')] + + .. note:: Deprecated in 1.5, use :func:`Column.isin` instead. + """ + warnings.warn("inSet is deprecated. Use isin() instead.") + return self.isin(*cols) + + @ignore_unicode_prefix + @since(1.5) + def isin(self, *cols): + """ + A boolean expression that is evaluated to true if the value of this + expression is contained by the evaluated values of the arguments. + + >>> df[df.name.isin("Bob", "Mike")].collect() + [Row(age=5, name=u'Bob')] + >>> df[df.age.isin([1, 2, 3])].collect() + [Row(age=2, name=u'Alice')] """ if len(cols) == 1 and isinstance(cols[0], (list, set)): cols = cols[0] cols = [c._jc if isinstance(c, Column) else _create_column_from_literal(c) for c in cols] sc = SparkContext._active_spark_context - jc = getattr(self._jc, "in")(_to_seq(sc, cols)) + jc = getattr(self._jc, "isin")(_to_seq(sc, cols)) return Column(jc) # order diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index b25dcbca82..75365fbcec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -627,8 +627,19 @@ class Column(protected[sql] val expr: Expression) extends Logging { * @group expr_ops * @since 1.3.0 */ + @deprecated("use isin", "1.5.0") @scala.annotation.varargs - def in(list: Any*): Column = In(expr, list.map(lit(_).expr)) + def in(list: Any*): Column = isin(list : _*) + + /** + * A boolean expression that is evaluated to true if the value of this expression is contained + * by the evaluated values of the arguments. + * + * @group expr_ops + * @since 1.5.0 + */ + @scala.annotation.varargs + def isin(list: Any*): Column = In(expr, list.map(lit(_).expr)) /** * SQL like expression. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index b351380373..e1b3443d74 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -345,23 +345,23 @@ class ColumnExpressionSuite extends QueryTest with SQLTestUtils { test("in") { val df = Seq((1, "x"), (2, "y"), (3, "z")).toDF("a", "b") - checkAnswer(df.filter($"a".in(1, 2)), + checkAnswer(df.filter($"a".isin(1, 2)), df.collect().toSeq.filter(r => r.getInt(0) == 1 || r.getInt(0) == 2)) - checkAnswer(df.filter($"a".in(3, 2)), + checkAnswer(df.filter($"a".isin(3, 2)), df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 2)) - checkAnswer(df.filter($"a".in(3, 1)), + checkAnswer(df.filter($"a".isin(3, 1)), df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) - checkAnswer(df.filter($"b".in("y", "x")), + checkAnswer(df.filter($"b".isin("y", "x")), df.collect().toSeq.filter(r => r.getString(1) == "y" || r.getString(1) == "x")) - checkAnswer(df.filter($"b".in("z", "x")), + checkAnswer(df.filter($"b".isin("z", "x")), df.collect().toSeq.filter(r => r.getString(1) == "z" || r.getString(1) == "x")) - checkAnswer(df.filter($"b".in("z", "y")), + checkAnswer(df.filter($"b".isin("z", "y")), df.collect().toSeq.filter(r => r.getString(1) == "z" || r.getString(1) == "y")) val df2 = Seq((1, Seq(1)), (2, Seq(2)), (3, Seq(3))).toDF("a", "b") intercept[AnalysisException] { - df2.filter($"a".in($"b")) + df2.filter($"a".isin($"b")) } } -- GitLab