diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index bccf8e7f636f1bc3f8d454e683645637c26e892c..235cee48bc6a68f5e2d39f480fd920b2ac1fd14f 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -1434,7 +1434,7 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
         super(MultilayerPerceptronClassifier, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
-        self._setDefault(maxIter=100, tol=1E-4, blockSize=128, stepSize=0.03, solver="l-bfgs")
+        self._setDefault(maxIter=100, tol=1E-6, blockSize=128, stepSize=0.03, solver="l-bfgs")
         kwargs = self._input_kwargs
         self.setParams(**kwargs)
 
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 88ac7e275e38639d2e213130acbc042a0f3de290..66fb00508522e9924071268faa67ca04fa126b38 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -745,7 +745,13 @@ class DistributedLDAModel(LDAModel, JavaMLReadable, JavaMLWritable):
 
         WARNING: This involves collecting a large :py:func:`topicsMatrix` to the driver.
         """
-        return LocalLDAModel(self._call_java("toLocal"))
+        model = LocalLDAModel(self._call_java("toLocal"))
+
+        # SPARK-10931: Temporary fix to be removed once LDAModel defines Params
+        model._create_params_from_java()
+        model._transfer_params_from_java()
+
+        return model
 
     @since("2.0.0")
     def trainingLogLikelihood(self):
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 0495973d2f6254350097a0c26a8915aef322b86c..6076b3c2f26a63d3a59d2075625dab31c71c9023 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -455,6 +455,54 @@ class ParamTests(PySparkTestCase):
             LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5]
         )
 
+    @staticmethod
+    def check_params(test_self, py_stage, check_params_exist=True):
+        """
+        Checks common requirements for Params.params:
+          - set of params exist in Java and Python and are ordered by names
+          - param parent has the same UID as the object's UID
+          - default param value from Java matches value in Python
+          - optionally check if all params from Java also exist in Python
+        """
+        py_stage_str = "%s %s" % (type(py_stage), py_stage)
+        if not hasattr(py_stage, "_to_java"):
+            return
+        java_stage = py_stage._to_java()
+        if java_stage is None:
+            return
+        test_self.assertEqual(py_stage.uid, java_stage.uid(), msg=py_stage_str)
+        if check_params_exist:
+            param_names = [p.name for p in py_stage.params]
+            java_params = list(java_stage.params())
+            java_param_names = [jp.name() for jp in java_params]
+            test_self.assertEqual(
+                param_names, sorted(java_param_names),
+                "Param list in Python does not match Java for %s:\nJava = %s\nPython = %s"
+                % (py_stage_str, java_param_names, param_names))
+        for p in py_stage.params:
+            test_self.assertEqual(p.parent, py_stage.uid)
+            java_param = java_stage.getParam(p.name)
+            py_has_default = py_stage.hasDefault(p)
+            java_has_default = java_stage.hasDefault(java_param)
+            test_self.assertEqual(py_has_default, java_has_default,
+                                  "Default value mismatch of param %s for Params %s"
+                                  % (p.name, str(py_stage)))
+            if py_has_default:
+                if p.name == "seed":
+                    continue  # Random seeds between Spark and PySpark are different
+                java_default = _java2py(test_self.sc,
+                                        java_stage.clear(java_param).getOrDefault(java_param))
+                py_stage._clear(p)
+                py_default = py_stage.getOrDefault(p)
+                # equality test for NaN is always False
+                if isinstance(java_default, float) and np.isnan(java_default):
+                    java_default = "NaN"
+                    py_default = "NaN" if np.isnan(py_default) else "not NaN"
+                test_self.assertEqual(
+                    java_default, py_default,
+                    "Java default %s != python default %s of param %s for Params %s"
+                    % (str(java_default), str(py_default), p.name, str(py_stage)))
+
 
 class EvaluatorTests(SparkSessionTestCase):
 
@@ -511,6 +559,8 @@ class FeatureTests(SparkSessionTestCase):
                          "Model should inherit the UID from its parent estimator.")
         output = idf0m.transform(dataset)
         self.assertIsNotNone(output.head().idf)
+        # Test that parameters transferred to Python Model
+        ParamTests.check_params(self, idf0m)
 
     def test_ngram(self):
         dataset = self.spark.createDataFrame([
@@ -1656,40 +1706,6 @@ class DefaultValuesTests(PySparkTestCase):
     those in their Scala counterparts.
     """
 
-    def check_params(self, py_stage):
-        import pyspark.ml.feature
-        if not hasattr(py_stage, "_to_java"):
-            return
-        java_stage = py_stage._to_java()
-        if java_stage is None:
-            return
-        for p in py_stage.params:
-            java_param = java_stage.getParam(p.name)
-            py_has_default = py_stage.hasDefault(p)
-            java_has_default = java_stage.hasDefault(java_param)
-            self.assertEqual(py_has_default, java_has_default,
-                             "Default value mismatch of param %s for Params %s"
-                             % (p.name, str(py_stage)))
-            if py_has_default:
-                if p.name == "seed":
-                    return  # Random seeds between Spark and PySpark are different
-                java_default =\
-                    _java2py(self.sc, java_stage.clear(java_param).getOrDefault(java_param))
-                py_stage._clear(p)
-                py_default = py_stage.getOrDefault(p)
-                if isinstance(py_stage, pyspark.ml.feature.Imputer) and p.name == "missingValue":
-                    # SPARK-15040 - default value for Imputer param 'missingValue' is NaN,
-                    # and NaN != NaN, so handle it specially here
-                    import math
-                    self.assertTrue(math.isnan(java_default) and math.isnan(py_default),
-                                    "Java default %s and python default %s are not both NaN for "
-                                    "param %s for Params %s"
-                                    % (str(java_default), str(py_default), p.name, str(py_stage)))
-                    return
-                self.assertEqual(java_default, py_default,
-                                 "Java default %s != python default %s of param %s for Params %s"
-                                 % (str(java_default), str(py_default), p.name, str(py_stage)))
-
     def test_java_params(self):
         import pyspark.ml.feature
         import pyspark.ml.classification
@@ -1703,7 +1719,8 @@ class DefaultValuesTests(PySparkTestCase):
         for name, cls in inspect.getmembers(module, inspect.isclass):
             if not name.endswith('Model') and issubclass(cls, JavaParams)\
                     and not inspect.isabstract(cls):
-                self.check_params(cls())
+                # NOTE: disable check_params_exist until there is parity with Scala API
+                ParamTests.check_params(self, cls(), check_params_exist=False)
 
 
 def _squared_distance(a, b):
diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py
index ee6301ef19a43870bb31499b572ee82c2f3a3fd4..0f846fbc5b5ef32930b3501fd62de0d50b7f8237 100644
--- a/python/pyspark/ml/wrapper.py
+++ b/python/pyspark/ml/wrapper.py
@@ -135,6 +135,20 @@ class JavaParams(JavaWrapper, Params):
             paramMap.put([pair])
         return paramMap
 
+    def _create_params_from_java(self):
+        """
+        SPARK-10931: Temporary fix to create params that are defined in the Java obj but not here
+        """
+        java_params = list(self._java_obj.params())
+        from pyspark.ml.param import Param
+        for java_param in java_params:
+            java_param_name = java_param.name()
+            if not hasattr(self, java_param_name):
+                param = Param(self, java_param_name, java_param.doc())
+                setattr(param, "created_from_java_param", True)
+                setattr(self, java_param_name, param)
+                self._params = None  # need to reset so self.params will discover new params
+
     def _transfer_params_from_java(self):
         """
         Transforms the embedded params from the companion Java object.
@@ -147,6 +161,10 @@ class JavaParams(JavaWrapper, Params):
                 if self._java_obj.isSet(java_param):
                     value = _java2py(sc, self._java_obj.getOrDefault(java_param))
                     self._set(**{param.name: value})
+                # SPARK-10931: Temporary fix for params that have a default in Java
+                if self._java_obj.hasDefault(java_param) and not self.isDefined(param):
+                    value = _java2py(sc, self._java_obj.getDefault(java_param)).get()
+                    self._setDefault(**{param.name: value})
 
     def _transfer_param_map_from_java(self, javaParamMap):
         """
@@ -204,6 +222,11 @@ class JavaParams(JavaWrapper, Params):
             # Load information from java_stage to the instance.
             py_stage = py_type()
             py_stage._java_obj = java_stage
+
+            # SPARK-10931: Temporary fix so that persisted models would own params from Estimator
+            if issubclass(py_type, JavaModel):
+                py_stage._create_params_from_java()
+
             py_stage._resetUid(java_stage.uid())
             py_stage._transfer_params_from_java()
         elif hasattr(py_type, "_from_java"):
@@ -263,7 +286,8 @@ class JavaEstimator(JavaParams, Estimator):
 
     def _fit(self, dataset):
         java_model = self._fit_java(dataset)
-        return self._create_model(java_model)
+        model = self._create_model(java_model)
+        return self._copyValues(model)
 
 
 @inherit_doc
@@ -307,4 +331,10 @@ class JavaModel(JavaTransformer, Model):
         """
         super(JavaModel, self).__init__(java_model)
         if java_model is not None:
+
+            # SPARK-10931: This is a temporary fix to allow models to own params
+            # from estimators. Eventually, these params should be in models through
+            # using common base classes between estimators and models.
+            self._create_params_from_java()
+
             self._resetUid(java_model.uid())
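Not part of the patch, but for illustration: a minimal sketch of the fit-time behavior this change enables, assuming an active SparkSession bound to `spark` (the estimator choice, toy data, and printed values are illustrative only). `JavaModel.__init__` now creates the model's params from the Java stage, and `JavaEstimator._fit` copies the estimator's param values onto the model via `_copyValues`, so the returned Python model exposes the params it was trained with:

    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.linalg import Vectors

    # Toy training data (illustrative values only).
    training = spark.createDataFrame(
        [(1.0, Vectors.dense([0.0, 1.1])), (0.0, Vectors.dense([2.0, 1.0]))],
        ["label", "features"])

    lr = LogisticRegression(maxIter=5, regParam=0.01)
    model = lr.fit(training)

    # The model now owns the estimator's params (created by
    # _create_params_from_java) and their values (copied by _copyValues).
    print(model.getOrDefault(model.getParam("maxIter")))   # 5
    print(model.getOrDefault(model.getParam("regParam")))  # 0.01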
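A companion sketch for the persistence path, reusing `model` from the example above (the save path is an arbitrary placeholder): `_from_java` now calls `_create_params_from_java` for `JavaModel` subclasses, and `_transfer_params_from_java` additionally pulls in Java-side defaults, so a reloaded model owns the same params as the original:

    from pyspark.ml.classification import LogisticRegressionModel

    model.write().overwrite().save("/tmp/spark-10931-lr-model")  # placeholder path
    loaded = LogisticRegressionModel.load("/tmp/spark-10931-lr-model")

    # The reloaded model's params mirror the Java stage, including defaults.
    print(loaded.getOrDefault(loaded.getParam("regParam")))  # 0.01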