From 07a2b8738ed8e6c136545d03f91a865de05e41a0 Mon Sep 17 00:00:00 2001 From: Reynold Xin <rxin@databricks.com> Date: Fri, 18 Aug 2017 23:58:20 +0900 Subject: [PATCH] [SPARK-21778][SQL] Simpler Dataset.sample API in Scala / Java ## What changes were proposed in this pull request? Dataset.sample requires a boolean flag withReplacement as the first argument. However, most of the time users simply want to sample some records without replacement. This ticket introduces a new sample function that simply takes in the fraction and seed. ## How was this patch tested? Tested manually. Not sure yet if we should add a test case for just this wrapper ... Author: Reynold Xin <rxin@databricks.com> Closes #18988 from rxin/SPARK-21778. --- .../scala/org/apache/spark/sql/Dataset.scala | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index a9887eb952..615686ccbe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1848,11 +1848,43 @@ class Dataset[T] private[sql]( Except(logicalPlan, other.logicalPlan) } + /** + * Returns a new [[Dataset]] by sampling a fraction of rows (without replacement), + * using a user-supplied seed. + * + * @param fraction Fraction of rows to generate, range [0.0, 1.0]. + * @param seed Seed for sampling. + * + * @note This is NOT guaranteed to provide exactly the fraction of the count + * of the given [[Dataset]]. + * + * @group typedrel + * @since 2.3.0 + */ + def sample(fraction: Double, seed: Long): Dataset[T] = { + sample(withReplacement = false, fraction = fraction, seed = seed) + } + + /** + * Returns a new [[Dataset]] by sampling a fraction of rows (without replacement). + * + * @param fraction Fraction of rows to generate, range [0.0, 1.0]. + * + * @note This is NOT guaranteed to provide exactly the fraction of the count + * of the given [[Dataset]]. + * + * @group typedrel + * @since 2.3.0 + */ + def sample(fraction: Double): Dataset[T] = { + sample(withReplacement = false, fraction = fraction) + } + /** * Returns a new [[Dataset]] by sampling a fraction of rows, using a user-supplied seed. * * @param withReplacement Sample with replacement or not. - * @param fraction Fraction of rows to generate. + * @param fraction Fraction of rows to generate, range [0.0, 1.0]. * @param seed Seed for sampling. * * @note This is NOT guaranteed to provide exactly the fraction of the count @@ -1871,7 +1903,7 @@ class Dataset[T] private[sql]( * Returns a new [[Dataset]] by sampling a fraction of rows, using a random seed. * * @param withReplacement Sample with replacement or not. - * @param fraction Fraction of rows to generate. + * @param fraction Fraction of rows to generate, range [0.0, 1.0]. * * @note This is NOT guaranteed to provide exactly the fraction of the total count * of the given [[Dataset]]. -- GitLab