Skip to content
Snippets Groups Projects
Commit 7e9d9756 authored by Michelangelo D'Agostino's avatar Michelangelo D'Agostino Committed by Xiangrui Meng
Browse files

[MLLIB] [PYTHON] SPARK-4221: Expose nonnegative ALS in the python API

SPARK-1553 added alternating nonnegative least squares to MLLib, however it's not possible to access it via the python API.  This pull request resolves that.

Author: Michelangelo D'Agostino <mdagostino@civisanalytics.com>

Closes #3095 from mdagost/python_nmf and squashes the following commits:

a6743ad [Michelangelo D'Agostino] Use setters instead of static methods in PythonMLLibAPI.  Remove the new static methods I added.  Set seed in tests.  Change ratings to ratingsRDD in both train and trainImplicit for consistency.
7cffd39 [Michelangelo D'Agostino] Swapped nonnegative and seed in a few more places.
3fdc851 [Michelangelo D'Agostino] Moved seed to the end of the python parameter list.
bdcc154 [Michelangelo D'Agostino] Change seed type to java.lang.Long so that it can handle null.
cedf043 [Michelangelo D'Agostino] Added in ability to set the seed from python and made that play nice with the nonnegative changes.  Also made the python ALS tests more exact.
a72fdc9 [Michelangelo D'Agostino] Expose nonnegative ALS in the python API.
parent 77791097
No related branches found
No related tags found
No related merge requests found
......@@ -275,12 +275,25 @@ class PythonMLLibAPI extends Serializable {
* the Py4J documentation.
*/
def trainALSModel(
ratings: JavaRDD[Rating],
ratingsJRDD: JavaRDD[Rating],
rank: Int,
iterations: Int,
lambda: Double,
blocks: Int): MatrixFactorizationModel = {
new MatrixFactorizationModelWrapper(ALS.train(ratings.rdd, rank, iterations, lambda, blocks))
blocks: Int,
nonnegative: Boolean,
seed: java.lang.Long): MatrixFactorizationModel = {
val als = new ALS()
.setRank(rank)
.setIterations(iterations)
.setLambda(lambda)
.setBlocks(blocks)
.setNonnegative(nonnegative)
if (seed != null) als.setSeed(seed)
val model = als.run(ratingsJRDD.rdd)
new MatrixFactorizationModelWrapper(model)
}
/**
......@@ -295,9 +308,23 @@ class PythonMLLibAPI extends Serializable {
iterations: Int,
lambda: Double,
blocks: Int,
alpha: Double): MatrixFactorizationModel = {
new MatrixFactorizationModelWrapper(
ALS.trainImplicit(ratingsJRDD.rdd, rank, iterations, lambda, blocks, alpha))
alpha: Double,
nonnegative: Boolean,
seed: java.lang.Long): MatrixFactorizationModel = {
val als = new ALS()
.setImplicitPrefs(true)
.setRank(rank)
.setIterations(iterations)
.setLambda(lambda)
.setBlocks(blocks)
.setAlpha(alpha)
.setNonnegative(nonnegative)
if (seed != null) als.setSeed(seed)
val model = als.run(ratingsJRDD.rdd)
new MatrixFactorizationModelWrapper(model)
}
/**
......
......@@ -44,31 +44,39 @@ class MatrixFactorizationModel(JavaModelWrapper):
>>> r2 = (1, 2, 2.0)
>>> r3 = (2, 1, 2.0)
>>> ratings = sc.parallelize([r1, r2, r3])
>>> model = ALS.trainImplicit(ratings, 1)
>>> model.predict(2,2) is not None
True
>>> model = ALS.trainImplicit(ratings, 1, seed=10)
>>> model.predict(2,2)
0.4473...
>>> testset = sc.parallelize([(1, 2), (1, 1)])
>>> model = ALS.train(ratings, 1)
>>> model.predictAll(testset).count() == 2
True
>>> model = ALS.train(ratings, 1, seed=10)
>>> model.predictAll(testset).collect()
[Rating(1, 1, 1), Rating(1, 2, 1)]
>>> model = ALS.train(ratings, 4)
>>> model.userFeatures().count() == 2
True
>>> model = ALS.train(ratings, 4, seed=10)
>>> model.userFeatures().collect()
[(2, array('d', [...])), (1, array('d', [...]))]
>>> first_user = model.userFeatures().take(1)[0]
>>> latents = first_user[1]
>>> len(latents) == 4
True
>>> model.productFeatures().count() == 2
True
>>> model.productFeatures().collect()
[(2, array('d', [...])), (1, array('d', [...]))]
>>> first_product = model.productFeatures().take(1)[0]
>>> latents = first_product[1]
>>> len(latents) == 4
True
>>> model = ALS.train(ratings, 1, nonnegative=True, seed=10)
>>> model.predict(2,2)
3.735...
>>> model = ALS.trainImplicit(ratings, 1, nonnegative=True, seed=10)
>>> model.predict(2,2)
0.4473...
"""
def predict(self, user, product):
return self._java_model.predict(user, product)
......@@ -101,15 +109,17 @@ class ALS(object):
return _to_java_object_rdd(ratings, True)
@classmethod
def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1):
def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False,
seed=None):
model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations,
lambda_, blocks)
lambda_, blocks, nonnegative, seed)
return MatrixFactorizationModel(model)
@classmethod
def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01):
def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01,
nonnegative=False, seed=None):
model = callMLlibFunc("trainImplicitALSModel", cls._prepare(ratings), rank,
iterations, lambda_, blocks, alpha)
iterations, lambda_, blocks, alpha, nonnegative, seed)
return MatrixFactorizationModel(model)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment