Skip to content
Snippets Groups Projects
Commit ac6411c6 authored by Xiangrui Meng's avatar Xiangrui Meng
Browse files

[SPARK-3081][MLLIB] rename RandomRDDGenerators to RandomRDDs

`RandomRDDGenerators` means factory for `RandomRDDGenerator`. However, its methods return RDDs but not RDDGenerators. So a more proper (and shorter) name would be `RandomRDDs`.

dorx brkyvz

Author: Xiangrui Meng <meng@databricks.com>

Closes #1979 from mengxr/randomrdds and squashes the following commits:

b161a2d [Xiangrui Meng] rename RandomRDDGenerators to RandomRDDs
parent 7e70708a
No related branches found
No related tags found
No related merge requests found
...@@ -27,7 +27,7 @@ import org.apache.spark.mllib.classification._ ...@@ -27,7 +27,7 @@ import org.apache.spark.mllib.classification._
import org.apache.spark.mllib.clustering._ import org.apache.spark.mllib.clustering._
import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.optimization._
import org.apache.spark.mllib.linalg.{Matrix, SparseVector, Vector, Vectors} import org.apache.spark.mllib.linalg.{Matrix, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.random.{RandomRDDGenerators => RG} import org.apache.spark.mllib.random.{RandomRDDs => RG}
import org.apache.spark.mllib.recommendation._ import org.apache.spark.mllib.recommendation._
import org.apache.spark.mllib.regression._ import org.apache.spark.mllib.regression._
import org.apache.spark.mllib.tree.configuration.{Algo, Strategy} import org.apache.spark.mllib.tree.configuration.{Algo, Strategy}
......
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
package org.apache.spark.mllib.random package org.apache.spark.mllib.random
import scala.reflect.ClassTag
import org.apache.spark.SparkContext import org.apache.spark.SparkContext
import org.apache.spark.annotation.Experimental import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vector
...@@ -24,14 +26,12 @@ import org.apache.spark.mllib.rdd.{RandomVectorRDD, RandomRDD} ...@@ -24,14 +26,12 @@ import org.apache.spark.mllib.rdd.{RandomVectorRDD, RandomRDD}
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils import org.apache.spark.util.Utils
import scala.reflect.ClassTag
/** /**
* :: Experimental :: * :: Experimental ::
* Generator methods for creating RDDs comprised of i.i.d. samples from some distribution. * Generator methods for creating RDDs comprised of i.i.d. samples from some distribution.
*/ */
@Experimental @Experimental
object RandomRDDGenerators { object RandomRDDs {
/** /**
* :: Experimental :: * :: Experimental ::
......
...@@ -34,7 +34,7 @@ import org.apache.spark.util.StatCounter ...@@ -34,7 +34,7 @@ import org.apache.spark.util.StatCounter
* *
* TODO update tests to use TestingUtils for floating point comparison after PR 1367 is merged * TODO update tests to use TestingUtils for floating point comparison after PR 1367 is merged
*/ */
class RandomRDDGeneratorsSuite extends FunSuite with LocalSparkContext with Serializable { class RandomRDDsSuite extends FunSuite with LocalSparkContext with Serializable {
def testGeneratedRDD(rdd: RDD[Double], def testGeneratedRDD(rdd: RDD[Double],
expectedSize: Long, expectedSize: Long,
...@@ -113,18 +113,18 @@ class RandomRDDGeneratorsSuite extends FunSuite with LocalSparkContext with Seri ...@@ -113,18 +113,18 @@ class RandomRDDGeneratorsSuite extends FunSuite with LocalSparkContext with Seri
val poissonMean = 100.0 val poissonMean = 100.0
for (seed <- 0 until 5) { for (seed <- 0 until 5) {
val uniform = RandomRDDGenerators.uniformRDD(sc, size, numPartitions, seed) val uniform = RandomRDDs.uniformRDD(sc, size, numPartitions, seed)
testGeneratedRDD(uniform, size, numPartitions, 0.5, 1 / math.sqrt(12)) testGeneratedRDD(uniform, size, numPartitions, 0.5, 1 / math.sqrt(12))
val normal = RandomRDDGenerators.normalRDD(sc, size, numPartitions, seed) val normal = RandomRDDs.normalRDD(sc, size, numPartitions, seed)
testGeneratedRDD(normal, size, numPartitions, 0.0, 1.0) testGeneratedRDD(normal, size, numPartitions, 0.0, 1.0)
val poisson = RandomRDDGenerators.poissonRDD(sc, poissonMean, size, numPartitions, seed) val poisson = RandomRDDs.poissonRDD(sc, poissonMean, size, numPartitions, seed)
testGeneratedRDD(poisson, size, numPartitions, poissonMean, math.sqrt(poissonMean), 0.1) testGeneratedRDD(poisson, size, numPartitions, poissonMean, math.sqrt(poissonMean), 0.1)
} }
// mock distribution to check that partitions have unique seeds // mock distribution to check that partitions have unique seeds
val random = RandomRDDGenerators.randomRDD(sc, new MockDistro(), 1000L, 1000, 0L) val random = RandomRDDs.randomRDD(sc, new MockDistro(), 1000L, 1000, 0L)
assert(random.collect.size === random.collect.distinct.size) assert(random.collect.size === random.collect.distinct.size)
} }
...@@ -135,13 +135,13 @@ class RandomRDDGeneratorsSuite extends FunSuite with LocalSparkContext with Seri ...@@ -135,13 +135,13 @@ class RandomRDDGeneratorsSuite extends FunSuite with LocalSparkContext with Seri
val poissonMean = 100.0 val poissonMean = 100.0
for (seed <- 0 until 5) { for (seed <- 0 until 5) {
val uniform = RandomRDDGenerators.uniformVectorRDD(sc, rows, cols, parts, seed) val uniform = RandomRDDs.uniformVectorRDD(sc, rows, cols, parts, seed)
testGeneratedVectorRDD(uniform, rows, cols, parts, 0.5, 1 / math.sqrt(12)) testGeneratedVectorRDD(uniform, rows, cols, parts, 0.5, 1 / math.sqrt(12))
val normal = RandomRDDGenerators.normalVectorRDD(sc, rows, cols, parts, seed) val normal = RandomRDDs.normalVectorRDD(sc, rows, cols, parts, seed)
testGeneratedVectorRDD(normal, rows, cols, parts, 0.0, 1.0) testGeneratedVectorRDD(normal, rows, cols, parts, 0.0, 1.0)
val poisson = RandomRDDGenerators.poissonVectorRDD(sc, poissonMean, rows, cols, parts, seed) val poisson = RandomRDDs.poissonVectorRDD(sc, poissonMean, rows, cols, parts, seed)
testGeneratedVectorRDD(poisson, rows, cols, parts, poissonMean, math.sqrt(poissonMean), 0.1) testGeneratedVectorRDD(poisson, rows, cols, parts, poissonMean, math.sqrt(poissonMean), 0.1)
} }
} }
......
...@@ -25,8 +25,7 @@ from pyspark.mllib._common import _deserialize_double, _deserialize_double_vecto ...@@ -25,8 +25,7 @@ from pyspark.mllib._common import _deserialize_double, _deserialize_double_vecto
from pyspark.serializers import NoOpSerializer from pyspark.serializers import NoOpSerializer
class RandomRDDGenerators: class RandomRDDs:
""" """
Generator methods for creating RDDs comprised of i.i.d samples from Generator methods for creating RDDs comprised of i.i.d samples from
some distribution. some distribution.
...@@ -40,17 +39,17 @@ class RandomRDDGenerators: ...@@ -40,17 +39,17 @@ class RandomRDDGenerators:
To transform the distribution in the generated RDD from U[0.0, 1.0] To transform the distribution in the generated RDD from U[0.0, 1.0]
to U[a, b], use to U[a, b], use
C{RandomRDDGenerators.uniformRDD(sc, n, p, seed)\ C{RandomRDDs.uniformRDD(sc, n, p, seed)\
.map(lambda v: a + (b - a) * v)} .map(lambda v: a + (b - a) * v)}
>>> x = RandomRDDGenerators.uniformRDD(sc, 100).collect() >>> x = RandomRDDs.uniformRDD(sc, 100).collect()
>>> len(x) >>> len(x)
100 100
>>> max(x) <= 1.0 and min(x) >= 0.0 >>> max(x) <= 1.0 and min(x) >= 0.0
True True
>>> RandomRDDGenerators.uniformRDD(sc, 100, 4).getNumPartitions() >>> RandomRDDs.uniformRDD(sc, 100, 4).getNumPartitions()
4 4
>>> parts = RandomRDDGenerators.uniformRDD(sc, 100, seed=4).getNumPartitions() >>> parts = RandomRDDs.uniformRDD(sc, 100, seed=4).getNumPartitions()
>>> parts == sc.defaultParallelism >>> parts == sc.defaultParallelism
True True
""" """
...@@ -66,10 +65,10 @@ class RandomRDDGenerators: ...@@ -66,10 +65,10 @@ class RandomRDDGenerators:
To transform the distribution in the generated RDD from standard normal To transform the distribution in the generated RDD from standard normal
to some other normal N(mean, sigma), use to some other normal N(mean, sigma), use
C{RandomRDDGenerators.normal(sc, n, p, seed)\ C{RandomRDDs.normal(sc, n, p, seed)\
.map(lambda v: mean + sigma * v)} .map(lambda v: mean + sigma * v)}
>>> x = RandomRDDGenerators.normalRDD(sc, 1000, seed=1L) >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1L)
>>> stats = x.stats() >>> stats = x.stats()
>>> stats.count() >>> stats.count()
1000L 1000L
...@@ -89,7 +88,7 @@ class RandomRDDGenerators: ...@@ -89,7 +88,7 @@ class RandomRDDGenerators:
distribution with the input mean. distribution with the input mean.
>>> mean = 100.0 >>> mean = 100.0
>>> x = RandomRDDGenerators.poissonRDD(sc, mean, 1000, seed=1L) >>> x = RandomRDDs.poissonRDD(sc, mean, 1000, seed=1L)
>>> stats = x.stats() >>> stats = x.stats()
>>> stats.count() >>> stats.count()
1000L 1000L
...@@ -110,12 +109,12 @@ class RandomRDDGenerators: ...@@ -110,12 +109,12 @@ class RandomRDDGenerators:
from the uniform distribution on [0.0 1.0]. from the uniform distribution on [0.0 1.0].
>>> import numpy as np >>> import numpy as np
>>> mat = np.matrix(RandomRDDGenerators.uniformVectorRDD(sc, 10, 10).collect()) >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
>>> mat.shape >>> mat.shape
(10, 10) (10, 10)
>>> mat.max() <= 1.0 and mat.min() >= 0.0 >>> mat.max() <= 1.0 and mat.min() >= 0.0
True True
>>> RandomRDDGenerators.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions() >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
4 4
""" """
jrdd = sc._jvm.PythonMLLibAPI() \ jrdd = sc._jvm.PythonMLLibAPI() \
...@@ -130,7 +129,7 @@ class RandomRDDGenerators: ...@@ -130,7 +129,7 @@ class RandomRDDGenerators:
from the standard normal distribution. from the standard normal distribution.
>>> import numpy as np >>> import numpy as np
>>> mat = np.matrix(RandomRDDGenerators.normalVectorRDD(sc, 100, 100, seed=1L).collect()) >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect())
>>> mat.shape >>> mat.shape
(100, 100) (100, 100)
>>> abs(mat.mean() - 0.0) < 0.1 >>> abs(mat.mean() - 0.0) < 0.1
...@@ -151,7 +150,7 @@ class RandomRDDGenerators: ...@@ -151,7 +150,7 @@ class RandomRDDGenerators:
>>> import numpy as np >>> import numpy as np
>>> mean = 100.0 >>> mean = 100.0
>>> rdd = RandomRDDGenerators.poissonVectorRDD(sc, mean, 100, 100, seed=1L) >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
>>> mat = np.mat(rdd.collect()) >>> mat = np.mat(rdd.collect())
>>> mat.shape >>> mat.shape
(100, 100) (100, 100)
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment