Skip to content
Snippets Groups Projects
Commit cffb899c authored by Wenchen Fan's avatar Wenchen Fan Committed by Michael Armbrust
Browse files

[SPARK-11803][SQL] fix Dataset self-join

When we resolve the join operator, we may change the output of right side if self-join is detected. So in `Dataset.joinWith`, we should resolve the join operator first, and then get the left output and right output from it, instead of using `left.output` and `right.output` directly.

Author: Wenchen Fan <wenchen@databricks.com>

Closes #9806 from cloud-fan/self-join.
parent 3cca5ffb
No related branches found
No related tags found
No related merge requests found
......@@ -498,13 +498,17 @@ class Dataset[T] private[sql](
val left = this.logicalPlan
val right = other.logicalPlan
val joined = sqlContext.executePlan(Join(left, right, Inner, Some(condition.expr)))
val leftOutput = joined.analyzed.output.take(left.output.length)
val rightOutput = joined.analyzed.output.takeRight(right.output.length)
val leftData = this.unresolvedTEncoder match {
case e if e.flat => Alias(left.output.head, "_1")()
case _ => Alias(CreateStruct(left.output), "_1")()
case e if e.flat => Alias(leftOutput.head, "_1")()
case _ => Alias(CreateStruct(leftOutput), "_1")()
}
val rightData = other.unresolvedTEncoder match {
case e if e.flat => Alias(right.output.head, "_2")()
case _ => Alias(CreateStruct(right.output), "_2")()
case e if e.flat => Alias(rightOutput.head, "_2")()
case _ => Alias(CreateStruct(rightOutput), "_2")()
}
......@@ -513,7 +517,7 @@ class Dataset[T] private[sql](
withPlan[(T, U)](other) { (left, right) =>
Project(
leftData :: rightData :: Nil,
Join(left, right, Inner, Some(condition.expr)))
joined.analyzed)
}
}
......
......@@ -347,7 +347,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
checkAnswer(joined, ("2", 2))
}
ignore("self join") {
test("self join") {
val ds = Seq("1", "2").toDS().as("a")
val joined = ds.joinWith(ds, lit(true))
checkAnswer(joined, ("1", "1"), ("1", "2"), ("2", "1"), ("2", "2"))
......@@ -360,15 +360,15 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
test("kryo encoder") {
implicit val kryoEncoder = Encoders.kryo[KryoData]
val ds = sqlContext.createDataset(Seq(KryoData(1), KryoData(2)))
val ds = Seq(KryoData(1), KryoData(2)).toDS()
assert(ds.groupBy(p => p).count().collect().toSeq ==
Seq((KryoData(1), 1L), (KryoData(2), 1L)))
}
ignore("kryo encoder self join") {
test("kryo encoder self join") {
implicit val kryoEncoder = Encoders.kryo[KryoData]
val ds = sqlContext.createDataset(Seq(KryoData(1), KryoData(2)))
val ds = Seq(KryoData(1), KryoData(2)).toDS()
assert(ds.joinWith(ds, lit(true)).collect().toSet ==
Set(
(KryoData(1), KryoData(1)),
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment