From 27e88faa058c1364d0e99fffc0c5cb64ef817bd3 Mon Sep 17 00:00:00 2001
From: Abou Haydar Elias <abouhaydar.elias@gmail.com>
Date: Fri, 4 Mar 2016 10:01:52 +0000
Subject: [PATCH] =?UTF-8?q?[SPARK-13646][MLLIB]=20QuantileDiscretizer=20co?=
 =?UTF-8?q?unts=20dataset=20twice=20in=20get=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## What changes were proposed in this pull request?

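When computing the sampling fraction, QuantileDiscretizer called `dataset.count()` even though
the row count was already available as `totalSamples`. This change reuses `totalSamples`, so the
DataFrame is no longer counted twice.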

Author: Abou Haydar Elias <abouhaydar.elias@gmail.com>
Author: Elie A <abouhaydar.elias@gmail.com>

Closes #11491 from eliasah/quantile-discretizer-patch.
---
 .../scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
index d75b3ef420..18896fcc4d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -118,7 +118,7 @@ object QuantileDiscretizer extends DefaultParamsReadable[QuantileDiscretizer] wi
     require(totalSamples > 0,
       "QuantileDiscretizer requires non-empty input dataset but was given an empty input.")
     val requiredSamples = math.max(numBins * numBins, minSamplesRequired)
-    val fraction = math.min(requiredSamples.toDouble / dataset.count(), 1.0)
+    val fraction = math.min(requiredSamples.toDouble / totalSamples, 1.0)
     dataset.sample(withReplacement = false, fraction, new XORShiftRandom(seed).nextInt()).collect()
   }
 
-- 
GitLab