diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala
index b7b1acc58242ef19d7154bedc18ea45099a682da..c741a2dd3ea3041151b431f52f684ee05bff4641 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala
@@ -37,8 +37,7 @@ private[sql] object ObjectType extends AbstractDataType {
  * outside of the execution engine.
  */
 private[sql] case class ObjectType(cls: Class[_]) extends DataType {
-  override def defaultSize: Int =
-    throw new UnsupportedOperationException("No size estimation available for objects.")
+  override def defaultSize: Int = 4096
 
   def asNullable: DataType = this
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index a6e3bd3a91a15555be42f127ef4bb23fe432f3bf..eee21acf7510b6770f846b9755bbd093fed2af5f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -630,6 +630,29 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     // Make sure the generated code for this plan can compile and execute.
     checkDataset(wideDF.map(_.getLong(0)), 0L until 10 : _*)
   }
+
+  test("SPARK-14838: estimating sizeInBytes in operators with ObjectProducer shouldn't fail") {
+    val dataset = Seq(
+      (0, 3, 54f),
+      (0, 4, 44f),
+      (0, 5, 42f),
+      (1, 3, 39f),
+      (1, 5, 33f),
+      (1, 4, 26f),
+      (2, 3, 51f),
+      (2, 5, 45f),
+      (2, 4, 30f)
+    ).toDF("user", "item", "rating")
+
+    val actual = dataset
+      .select("user", "item")
+      .as[(Int, Int)]
+      .groupByKey(_._1)
+      .mapGroups { case (src, ids) => (src, ids.map(_._2).toArray) }
+      .toDF("id", "actual")
+
+    dataset.join(actual, dataset("user") === actual("id")).collect()
+  }
 }
 
 case class OtherTuple(_1: String, _2: Int)
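
Note on the fix: Catalyst estimates a plan's sizeInBytes by summing defaultSize over its output attributes to approximate row width, so any operator producing an ObjectType attribute (an ObjectProducer node such as the MapGroups behind mapGroups above) previously threw the UnsupportedOperationException as soon as the planner requested statistics, e.g. when deciding whether to broadcast one side of the join in the test. A minimal sketch of the arithmetic under this patch's 4096-byte default (ObjectType is private[sql], so this only compiles from within the org.apache.spark.sql package tree, and estimatedRowWidth is an illustrative helper, not Spark's API):

  import org.apache.spark.sql.types._

  // Illustrative only: sum each output attribute's defaultSize to get a
  // rough per-row width, as Catalyst's statistics code does internally.
  def estimatedRowWidth(types: Seq[DataType]): Long =
    types.map(_.defaultSize.toLong).sum

  // Before this patch the ObjectType term threw; now it contributes 4096.
  val width = estimatedRowWidth(Seq(IntegerType, ObjectType(classOf[Array[Int]])))
  // width == 4 + 4096 == 4100 bytes per row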