Skip to content
Snippets Groups Projects
Commit ca849ac4 authored by Shixiong Zhu's avatar Shixiong Zhu
Browse files

[SPARK-19841][SS] watermarkPredicate should filter based on keys

## What changes were proposed in this pull request?

`StreamingDeduplicateExec.watermarkPredicate` should filter based on keys. Otherwise, it may generate a wrong answer if the watermark column in `keyExpression` has a different position in the row.

`StateStoreSaveExec` has the same code, but its parent can make sure the watermark column positions in `keyExpression` and `row` are the same.

## How was this patch tested?

The added test.

Author: Shixiong Zhu <shixiong@databricks.com>

Closes #17183 from zsxwing/SPARK-19841.
parent b9783a92
No related branches found
No related tags found
No related merge requests found
...@@ -68,7 +68,7 @@ trait StateStoreWriter extends StatefulOperator { ...@@ -68,7 +68,7 @@ trait StateStoreWriter extends StatefulOperator {
} }
/** An operator that supports watermark. */ /** An operator that supports watermark. */
trait WatermarkSupport extends SparkPlan { trait WatermarkSupport extends UnaryExecNode {
/** The keys that may have a watermark attribute. */ /** The keys that may have a watermark attribute. */
def keyExpressions: Seq[Attribute] def keyExpressions: Seq[Attribute]
...@@ -76,8 +76,8 @@ trait WatermarkSupport extends SparkPlan { ...@@ -76,8 +76,8 @@ trait WatermarkSupport extends SparkPlan {
/** The watermark value. */ /** The watermark value. */
def eventTimeWatermark: Option[Long] def eventTimeWatermark: Option[Long]
/** Generate a predicate that matches data older than the watermark */ /** Generate an expression that matches data older than the watermark */
lazy val watermarkPredicate: Option[Predicate] = { lazy val watermarkExpression: Option[Expression] = {
val optionalWatermarkAttribute = val optionalWatermarkAttribute =
keyExpressions.find(_.metadata.contains(EventTimeWatermark.delayKey)) keyExpressions.find(_.metadata.contains(EventTimeWatermark.delayKey))
...@@ -96,9 +96,19 @@ trait WatermarkSupport extends SparkPlan { ...@@ -96,9 +96,19 @@ trait WatermarkSupport extends SparkPlan {
} }
logInfo(s"Filtering state store on: $evictionExpression") logInfo(s"Filtering state store on: $evictionExpression")
newPredicate(evictionExpression, keyExpressions) evictionExpression
} }
} }
/**
 * Generate a predicate based on keys that matches data older than the watermark.
 * Bound against `keyExpressions` because state-store rows are keyed by them, so this
 * is the predicate to use when evicting expired entries from the state store.
 */
lazy val watermarkPredicateForKeys: Option[Predicate] =
watermarkExpression.map(newPredicate(_, keyExpressions))
/**
 * Generate a predicate based on the child output that matches data older than the watermark.
 * Bound against `child.output` (not `keyExpressions`) because the watermark column's
 * position in an input row may differ from its position within the keys (SPARK-19841),
 * so a key-bound predicate must not be evaluated on input rows.
 */
lazy val watermarkPredicate: Option[Predicate] =
watermarkExpression.map(newPredicate(_, child.output))
} }
/** /**
...@@ -192,7 +202,7 @@ case class StateStoreSaveExec( ...@@ -192,7 +202,7 @@ case class StateStoreSaveExec(
} }
// Assumption: Append mode can be done only when watermark has been specified // Assumption: Append mode can be done only when watermark has been specified
store.remove(watermarkPredicate.get.eval _) store.remove(watermarkPredicateForKeys.get.eval _)
store.commit() store.commit()
numTotalStateRows += store.numKeys() numTotalStateRows += store.numKeys()
...@@ -215,7 +225,9 @@ case class StateStoreSaveExec( ...@@ -215,7 +225,9 @@ case class StateStoreSaveExec(
override def hasNext: Boolean = { override def hasNext: Boolean = {
if (!baseIterator.hasNext) { if (!baseIterator.hasNext) {
// Remove old aggregates if watermark specified // Remove old aggregates if watermark specified
if (watermarkPredicate.nonEmpty) store.remove(watermarkPredicate.get.eval _) if (watermarkPredicateForKeys.nonEmpty) {
store.remove(watermarkPredicateForKeys.get.eval _)
}
store.commit() store.commit()
numTotalStateRows += store.numKeys() numTotalStateRows += store.numKeys()
false false
...@@ -361,7 +373,7 @@ case class StreamingDeduplicateExec( ...@@ -361,7 +373,7 @@ case class StreamingDeduplicateExec(
val numUpdatedStateRows = longMetric("numUpdatedStateRows") val numUpdatedStateRows = longMetric("numUpdatedStateRows")
val baseIterator = watermarkPredicate match { val baseIterator = watermarkPredicate match {
case Some(predicate) => iter.filter((row: InternalRow) => !predicate.eval(row)) case Some(predicate) => iter.filter(row => !predicate.eval(row))
case None => iter case None => iter
} }
...@@ -381,7 +393,7 @@ case class StreamingDeduplicateExec( ...@@ -381,7 +393,7 @@ case class StreamingDeduplicateExec(
} }
CompletionIterator[InternalRow, Iterator[InternalRow]](result, { CompletionIterator[InternalRow, Iterator[InternalRow]](result, {
watermarkPredicate.foreach(f => store.remove(f.eval _)) watermarkPredicateForKeys.foreach(f => store.remove(f.eval _))
store.commit() store.commit()
numTotalStateRows += store.numKeys() numTotalStateRows += store.numKeys()
}) })
......
...@@ -249,4 +249,23 @@ class DeduplicateSuite extends StateStoreMetricsTest with BeforeAndAfterAll { ...@@ -249,4 +249,23 @@ class DeduplicateSuite extends StateStoreMetricsTest with BeforeAndAfterAll {
} }
} }
} }
// Regression test for SPARK-19841: the watermark predicate was bound against the key
// schema but evaluated on full input rows. `dropDuplicates("id", "time")` puts the
// watermark column ("time") at a different position in the keys than in the row,
// which previously produced wrong answers; this test exercises exactly that layout.
test("SPARK-19841: watermarkPredicate should filter based on keys") {
// Tuples are (event time in seconds, id); "time" is cast to a timestamp below.
val input = MemoryStream[(Int, Int)]
val df = input.toDS.toDF("time", "id")
.withColumn("time", $"time".cast("timestamp"))
.withWatermark("time", "1 second")
.dropDuplicates("id", "time") // Change the column positions
.select($"id")
testStream(df)(
AddData(input, 1 -> 1, 1 -> 1, 1 -> 2),
CheckLastBatch(1, 2),
// Max event time seen becomes 2s; with a 1 second delay the watermark advances to 1s,
// so from the next batch on, rows with time at or below 1s are treated as late.
AddData(input, 1 -> 1, 2 -> 3, 2 -> 4),
CheckLastBatch(3, 4),
AddData(input, 1 -> 0, 1 -> 1, 3 -> 5, 3 -> 6), // Drop (1 -> 0, 1 -> 1) due to watermark
CheckLastBatch(5, 6),
AddData(input, 1 -> 0, 4 -> 7), // Drop (1 -> 0) due to watermark
CheckLastBatch(7)
)
}
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment