diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index a9887eb95279f7f4195f5b64c966bb058df8d1e4..615686ccbe2b39d445f49f4626697f73c610092b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -1848,11 +1848,43 @@ class Dataset[T] private[sql](
     Except(logicalPlan, other.logicalPlan)
   }
 
+  /**
+   * Returns a new [[Dataset]] by sampling a fraction of rows (without replacement),
+   * using a user-supplied seed.
+   *
+   * @param fraction Fraction of rows to generate, range [0.0, 1.0].
+   * @param seed Seed for sampling.
+   *
+   * @note This is NOT guaranteed to provide exactly the fraction of the count
+   * of the given [[Dataset]].
+   *
+   * @group typedrel
+   * @since 2.3.0
+   */
+  def sample(fraction: Double, seed: Long): Dataset[T] = {
+    sample(withReplacement = false, fraction = fraction, seed = seed)
+  }
+
+  /**
+   * Returns a new [[Dataset]] by sampling a fraction of rows (without replacement).
+   *
+   * @param fraction Fraction of rows to generate, range [0.0, 1.0].
+   *
+   * @note This is NOT guaranteed to provide exactly the fraction of the count
+   * of the given [[Dataset]].
+   *
+   * @group typedrel
+   * @since 2.3.0
+   */
+  def sample(fraction: Double): Dataset[T] = {
+    sample(withReplacement = false, fraction = fraction)
+  }
+
   /**
    * Returns a new [[Dataset]] by sampling a fraction of rows, using a user-supplied seed.
    *
    * @param withReplacement Sample with replacement or not.
-   * @param fraction Fraction of rows to generate.
+   * @param fraction Fraction of rows to generate, range [0.0, 1.0].
    * @param seed Seed for sampling.
    *
    * @note This is NOT guaranteed to provide exactly the fraction of the count
@@ -1871,7 +1903,7 @@ class Dataset[T] private[sql](
    * Returns a new [[Dataset]] by sampling a fraction of rows, using a random seed.
    *
    * @param withReplacement Sample with replacement or not.
-   * @param fraction Fraction of rows to generate.
+   * @param fraction Fraction of rows to generate, range [0.0, 1.0].
    *
    * @note This is NOT guaranteed to provide exactly the fraction of the total count
    * of the given [[Dataset]].