diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala
index 6addab69f13ecbba8251523cc26c30b6426f4273..e46356392c51b7b5aee0434140b8d70bd78b1f51 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala
@@ -156,8 +156,13 @@ trait WatermarkSupport extends UnaryExecNode {
   }
 
   /** Predicate based on keys that matches data older than the watermark */
-  lazy val watermarkPredicateForKeys: Option[Predicate] =
-    watermarkExpression.map(newPredicate(_, keyExpressions))
+  lazy val watermarkPredicateForKeys: Option[Predicate] = watermarkExpression.flatMap { e =>
+    if (keyExpressions.exists(_.metadata.contains(EventTimeWatermark.delayKey))) {
+      Some(newPredicate(e, keyExpressions))
+    } else {
+      None
+    }
+  }
 
   /** Predicate based on the child output that matches data older than the watermark. */
   lazy val watermarkPredicateForData: Option[Predicate] =
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala
index a15c2cff930fc287ead95186779dd6a7920a8df3..e858b7d9998a87fea65d466efcc979411fa723b5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala
@@ -268,4 +268,17 @@ class DeduplicateSuite extends StateStoreMetricsTest with BeforeAndAfterAll {
       CheckLastBatch(7)
     )
   }
+
+  test("SPARK-21546: dropDuplicates should ignore watermark when it's not a key") {
+    val input = MemoryStream[(Int, Int)]
+    val df = input.toDS.toDF("id", "time")
+      .withColumn("time", $"time".cast("timestamp"))
+      .withWatermark("time", "1 second")
+      .dropDuplicates("id")
+      .select($"id", $"time".cast("long"))
+    testStream(df)(
+      AddData(input, 1 -> 1, 1 -> 2, 2 -> 2),
+      CheckLastBatch(1 -> 1, 2 -> 2)
+    )
+  }
 }