diff --git a/core/src/main/scala/org/apache/spark/SSLOptions.scala b/core/src/main/scala/org/apache/spark/SSLOptions.scala index be19179b00a49399f0cd727457474cf4c1c5fd1a..5f14102c3c36670af045b5349be7cca06519c82f 100644 --- a/core/src/main/scala/org/apache/spark/SSLOptions.scala +++ b/core/src/main/scala/org/apache/spark/SSLOptions.scala @@ -150,8 +150,8 @@ private[spark] object SSLOptions extends Logging { * $ - `[ns].enabledAlgorithms` - a comma separated list of ciphers * * For a list of protocols and ciphers supported by particular Java versions, you may go to - * [[https://blogs.oracle.com/java-platform-group/entry/diagnosing_tls_ssl_and_https Oracle - * blog page]]. + * <a href="https://blogs.oracle.com/java-platform-group/entry/diagnosing_tls_ssl_and_https"> + * Oracle blog page</a>. * * You can optionally specify the default configuration. If you do, for each setting which is * missing in SparkConf, the corresponding setting is used from the default configuration. diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index bff5a29bb60f173d589ef5cec52d63dd4559f92f..d7e3a1b1be48c2f3f2840cbc209c2bf048dc3d87 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -405,7 +405,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * partitioning of the resulting key-value pair RDD by passing a Partitioner. * * @note If you are grouping in order to perform an aggregation (such as a sum or average) over - * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]] + * each key, using `JavaPairRDD.reduceByKey` or `JavaPairRDD.combineByKey` * will provide much better performance. */ def groupByKey(partitioner: Partitioner): JavaPairRDD[K, JIterable[V]] = @@ -416,7 +416,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * resulting RDD with into `numPartitions` partitions. * * @note If you are grouping in order to perform an aggregation (such as a sum or average) over - * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]] + * each key, using `JavaPairRDD.reduceByKey` or `JavaPairRDD.combineByKey` * will provide much better performance. */ def groupByKey(numPartitions: Int): JavaPairRDD[K, JIterable[V]] = @@ -546,7 +546,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * resulting RDD with the existing partitioner/parallelism level. * * @note If you are grouping in order to perform an aggregation (such as a sum or average) over - * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]] + * each key, using `JavaPairRDD.reduceByKey` or `JavaPairRDD.combineByKey` * will provide much better performance. 
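The `@note` blocks above steer users away from `groupByKey` and toward `reduceByKey`/`combineByKey` for aggregations. A minimal sketch of the difference, assuming an already-constructed `SparkContext` named `sc` (not part of this patch); the method name is illustrative:

```scala
import org.apache.spark.SparkContext

// Minimal sketch, assuming an existing SparkContext named `sc`.
def wordCounts(sc: SparkContext): Unit = {
  val pairs = sc.parallelize(Seq("a" -> 1, "b" -> 1, "a" -> 1))

  // groupByKey materializes every value for a key before summing them.
  val viaGroup = pairs.groupByKey().mapValues(_.sum)

  // reduceByKey combines values map-side first, which is what the note above recommends.
  val viaReduce = pairs.reduceByKey(_ + _)

  viaGroup.collect().foreach(println)
  viaReduce.collect().foreach(println)
}
```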
*/ def groupByKey(): JavaPairRDD[K, JIterable[V]] = diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala index ccd94f876e0b8cc4c3b08465a14c77404c244e30..a20d264be5afdf21f90f96e8e87d929a928dd00e 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala @@ -103,10 +103,10 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] - * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * with replacement: expected number of times each element is chosen; fraction must be >= 0 * * @note This is NOT guaranteed to provide exactly the fraction of the count - * of the given [[RDD]]. + * of the given `RDD`. */ def sample(withReplacement: Boolean, fraction: Double): JavaRDD[T] = sample(withReplacement, fraction, Utils.random.nextLong) @@ -117,11 +117,11 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] - * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * with replacement: expected number of times each element is chosen; fraction must be >= 0 * @param seed seed for the random number generator * * @note This is NOT guaranteed to provide exactly the fraction of the count - * of the given [[RDD]]. + * of the given `RDD`. */ def sample(withReplacement: Boolean, fraction: Double, seed: Long): JavaRDD[T] = wrapRDD(rdd.sample(withReplacement, fraction, seed)) @@ -167,7 +167,7 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) * Return an RDD with the elements from `this` that are not in `other`. * * Uses `this` partitioner/partition size, because even if `other` is huge, the resulting - * RDD will be <= us. + * RDD will be <= us. 
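The `sample` Scaladoc in the JavaRDD.scala hunk above distinguishes the two meanings of `fraction`. A small illustrative sketch, assuming an existing `RDD[Int]` called `data`:

```scala
import org.apache.spark.rdd.RDD

// Sketch of the `fraction` semantics documented above; `data` is assumed to exist.
def samplingExamples(data: RDD[Int]): Unit = {
  // Without replacement: each element is kept with probability 0.1,
  // so the sample size is only approximately 10% of the input.
  val tenPercent = data.sample(withReplacement = false, fraction = 0.1)

  // With replacement: each element is drawn an expected 2 times (fraction may exceed 1).
  val doubled = data.sample(withReplacement = true, fraction = 2.0)

  println(s"sampled ${tenPercent.count()} and ${doubled.count()} elements")
}
```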
*/ def subtract(other: JavaRDD[T]): JavaRDD[T] = wrapRDD(rdd.subtract(other)) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 38d347aeab8c6f71afeed5aaaba781cef63e248e..9481156bc93a5e6bbe5eeba9bad61d5f6efd198d 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -238,7 +238,9 @@ class JavaSparkContext(val sc: SparkContext) * }}} * * Do - * `JavaPairRDD<String, byte[]> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`, + * {{{ + * JavaPairRDD<String, byte[]> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path") + * }}} * * then `rdd` contains * {{{ @@ -270,7 +272,9 @@ class JavaSparkContext(val sc: SparkContext) * }}} * * Do - * `JavaPairRDD<String, byte[]> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`, + * {{{ + * JavaPairRDD<String, byte[]> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path") + * }}}, * * then `rdd` contains * {{{ @@ -749,7 +753,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Get a local property set in this thread, or null if it is missing. See - * [[org.apache.spark.api.java.JavaSparkContext.setLocalProperty]]. + * `org.apache.spark.api.java.JavaSparkContext.setLocalProperty`. */ def getLocalProperty(key: String): String = sc.getLocalProperty(key) @@ -769,7 +773,7 @@ class JavaSparkContext(val sc: SparkContext) * Application programmers can use this method to group all those jobs together and give a * group description. Once set, the Spark web UI will associate such jobs with this group. * - * The application can also use [[org.apache.spark.api.java.JavaSparkContext.cancelJobGroup]] + * The application can also use `org.apache.spark.api.java.JavaSparkContext.cancelJobGroup` * to cancel all running jobs in this group. For example, * {{{ * // In the main thread: @@ -802,7 +806,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Cancel active jobs for the specified group. See - * [[org.apache.spark.api.java.JavaSparkContext.setJobGroup]] for more information. + * `org.apache.spark.api.java.JavaSparkContext.setJobGroup` for more information. */ def cancelJobGroup(groupId: String): Unit = sc.cancelJobGroup(groupId) diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala index 6ba79e506a648024c1a736fd76382bd71d790ea9..2e991ce394c42bb59feffac1a82ef34637ab82fe 100644 --- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala +++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala @@ -172,7 +172,7 @@ private final object SnappyCompressionCodec { } /** - * Wrapper over [[SnappyOutputStream]] which guards against write-after-close and double-close + * Wrapper over `SnappyOutputStream` which guards against write-after-close and double-close * issues. See SPARK-7660 for more details. This wrapping can be removed if we upgrade to a version * of snappy-java that contains the fix for https://github.com/xerial/snappy-java/issues/107. 
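The CompressionCodec comment above describes a wrapper that guards against write-after-close and double-close. A generic sketch of that guard pattern, not Spark's actual wrapper class:

```scala
import java.io.OutputStream

// Illustration of the guard pattern described above (not Spark's wrapper):
// refuse writes after close and make close() idempotent.
class CloseGuardedOutputStream(out: OutputStream) extends OutputStream {
  private var closed = false

  override def write(b: Int): Unit = {
    if (closed) throw new java.io.IOException("Stream is closed")
    out.write(b)
  }

  override def flush(): Unit = {
    if (!closed) out.flush()
  }

  override def close(): Unit = {
    if (!closed) {      // a second close() becomes a no-op
      closed = true
      out.close()
    }
  }
}
```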
*/ diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index bff2b8f1d06c95338ea8adb8a4597ffc96d1853d..8e673447581cfe16f2f4def8fef9462fe03ebeaa 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -70,8 +70,8 @@ import org.apache.spark.util.random.{BernoulliCellSampler, BernoulliSampler, Poi * All of the scheduling and execution in Spark is done based on these methods, allowing each RDD * to implement its own way of computing itself. Indeed, users can implement custom RDDs (e.g. for * reading data from a new storage system) by overriding these functions. Please refer to the - * [[http://people.csail.mit.edu/matei/papers/2012/nsdi_spark.pdf Spark paper]] for more details - * on RDD internals. + * <a href="http://people.csail.mit.edu/matei/papers/2012/nsdi_spark.pdf">Spark paper</a> + * for more details on RDD internals. */ abstract class RDD[T: ClassTag]( @transient private var _sc: SparkContext, @@ -469,7 +469,7 @@ abstract class RDD[T: ClassTag]( * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] - * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * with replacement: expected number of times each element is chosen; fraction must be >= 0 * @param seed seed for the random number generator * * @note This is NOT guaranteed to provide exactly the fraction of the count @@ -675,8 +675,8 @@ abstract class RDD[T: ClassTag]( * may even differ each time the resulting RDD is evaluated. * * @note This operation may be very expensive. If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. */ def groupBy[K](f: T => K)(implicit kt: ClassTag[K]): RDD[(K, Iterable[T])] = withScope { groupBy[K](f, defaultPartitioner(this)) @@ -688,8 +688,8 @@ abstract class RDD[T: ClassTag]( * may even differ each time the resulting RDD is evaluated. * * @note This operation may be very expensive. If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. */ def groupBy[K]( f: T => K, @@ -703,8 +703,8 @@ abstract class RDD[T: ClassTag]( * may even differ each time the resulting RDD is evaluated. * * @note This operation may be very expensive. If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. 
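The `groupBy` notes above point to `PairRDDFunctions.aggregateByKey` for aggregations. A hedged sketch of a per-key average computed that way, assuming an existing `SparkContext` named `sc`:

```scala
import org.apache.spark.SparkContext

// Sketch of the `aggregateByKey` alternative suggested above: a per-key average
// computed as (sum, count) pairs, so no per-key group is ever materialized.
// `sc` is assumed to be an existing SparkContext.
def averageByKey(sc: SparkContext): Unit = {
  val scores = sc.parallelize(Seq(("a", 3.0), ("b", 4.0), ("a", 5.0)))

  val averages = scores
    .aggregateByKey((0.0, 0L))(
      (acc, v) => (acc._1 + v, acc._2 + 1),      // fold one value into (sum, count)
      (l, r) => (l._1 + r._1, l._2 + r._2))      // merge partial (sum, count) pairs
    .mapValues { case (sum, count) => sum / count }

  averages.collect().foreach(println)   // e.g. (a,4.0), (b,4.0)
}
```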
*/ def groupBy[K](f: T => K, p: Partitioner)(implicit kt: ClassTag[K], ord: Ordering[K] = null) : RDD[(K, Iterable[T])] = withScope { diff --git a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala index 8f15f50bee8146b5ec7ef3b9c3a5edb34bf88bd5..f41fc38be20805c4c0c040bb1327770f5aa2141e 100644 --- a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala +++ b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala @@ -46,7 +46,7 @@ private[spark] object CryptoStreamUtils extends Logging { val COMMONS_CRYPTO_CONF_PREFIX = "commons.crypto." /** - * Helper method to wrap [[OutputStream]] with [[CryptoOutputStream]] for encryption. + * Helper method to wrap `OutputStream` with `CryptoOutputStream` for encryption. */ def createCryptoOutputStream( os: OutputStream, @@ -62,7 +62,7 @@ private[spark] object CryptoStreamUtils extends Logging { } /** - * Helper method to wrap [[InputStream]] with [[CryptoInputStream]] for decryption. + * Helper method to wrap `InputStream` with `CryptoInputStream` for decryption. */ def createCryptoInputStream( is: InputStream, diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 19e020c968a9a0fc46b8c4246eed065d76e9482f..7eb2da1c2748c22191f43d6be07a99dabc71c697 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -43,7 +43,8 @@ import org.apache.spark.util.{BoundedPriorityQueue, SerializableConfiguration, S import org.apache.spark.util.collection.CompactBuffer /** - * A Spark serializer that uses the [[https://code.google.com/p/kryo/ Kryo serialization library]]. + * A Spark serializer that uses the <a href="https://code.google.com/p/kryo/"> + * Kryo serialization library</a>. * * @note This serializer is not guaranteed to be wire-compatible across different versions of * Spark. It is intended to be used to serialize/de-serialize data within a single diff --git a/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala b/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala index bf087af16a5b1400b9ec3c34eaa1901f1f1aaee4..bb8a684b4c7a8e3ea0753e12628df089befc5b45 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala @@ -89,17 +89,18 @@ class RandomBlockReplicationPolicy prioritizedPeers } + // scalastyle:off line.size.limit /** * Uses sampling algorithm by Robert Floyd. Finds a random sample in O(n) while - * minimizing space usage - * [[http://math.stackexchange.com/questions/178690/ - * whats-the-proof-of-correctness-for-robert-floyds-algorithm-for-selecting-a-sin]] + * minimizing space usage. Please see <a href="http://math.stackexchange.com/questions/178690/whats-the-proof-of-correctness-for-robert-floyds-algorithm-for-selecting-a-sin"> + * here</a>. 
* * @param n total number of indices * @param m number of samples needed * @param r random number generator * @return list of m random unique indices */ + // scalastyle:on line.size.limit private def getSampleIds(n: Int, m: Int, r: Random): List[Int] = { val indices = (n - m + 1 to n).foldLeft(Set.empty[Int]) {case (set, i) => val t = r.nextInt(i) + 1 diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 57f6f2f0a9be598c2954d27f0894e0030f9a7d2a..dbeb970c81dfeed51f1c86dc3558359e6e1d7134 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -422,8 +422,8 @@ private[spark] object UIUtils extends Logging { * the whole string will rendered as a simple escaped text. * * Note: In terms of security, only anchor tags with root relative links are supported. So any - * attempts to embed links outside Spark UI, or other tags like <script> will cause in the whole - * description to be treated as plain text. + * attempts to embed links outside Spark UI, or other tags like <script> will cause in + * the whole description to be treated as plain text. * * @param desc the original job or stage description string, which may contain html tags. * @param basePathUri with which to prepend the relative links; this is used when plainText is diff --git a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala index 1326f0977c241734bc8cb903ca6d08f2eb2f7bdd..00e0cf257cd4ac82f912dcfbccefd5e026e033ae 100644 --- a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala +++ b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala @@ -224,7 +224,7 @@ private[spark] object AccumulatorContext { * Registers an [[AccumulatorV2]] created on the driver such that it can be used on the executors. * * All accumulators registered here can later be used as a container for accumulating partial - * values across multiple tasks. This is what [[org.apache.spark.scheduler.DAGScheduler]] does. + * values across multiple tasks. This is what `org.apache.spark.scheduler.DAGScheduler` does. * Note: if an accumulator is registered here, it should also be registered with the active * context cleaner for cleanup so as to avoid memory leaks. * diff --git a/core/src/main/scala/org/apache/spark/util/RpcUtils.scala b/core/src/main/scala/org/apache/spark/util/RpcUtils.scala index e3b588374ce1aba72fbfd12766c8c92f0d05736c..46a5cb2cff5a5dada63839ad4b5e36b3be0155d4 100644 --- a/core/src/main/scala/org/apache/spark/util/RpcUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/RpcUtils.scala @@ -23,7 +23,7 @@ import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef, RpcEnv, RpcTimeout} private[spark] object RpcUtils { /** - * Retrieve a [[RpcEndpointRef]] which is located in the driver via its name. + * Retrieve a `RpcEndpointRef` which is located in the driver via its name. 
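The BlockReplicationPolicy hunk above documents `getSampleIds`, which uses Robert Floyd's sampling algorithm. A standalone sketch mirroring the shape of the helper shown in that hunk:

```scala
import scala.util.Random

// Standalone sketch of Robert Floyd's sampling, mirroring the `getSampleIds`
// helper in the BlockReplicationPolicy hunk above: picks m distinct values
// from 1..n using O(m) space and m calls to the RNG.
def floydSample(n: Int, m: Int, r: Random): List[Int] = {
  require(m <= n, "cannot sample more indices than are available")
  val indices = (n - m + 1 to n).foldLeft(Set.empty[Int]) { case (set, i) =>
    val t = r.nextInt(i) + 1                     // uniform in 1..i
    if (set.contains(t)) set + i else set + t    // on collision, take i itself
  }
  indices.toList
}

// Example: five distinct indices from 1..100.
// println(floydSample(100, 5, new Random(42L)))
```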
*/ def makeDriverRef(name: String, conf: SparkConf, rpcEnv: RpcEnv): RpcEndpointRef = { val driverHost: String = conf.get("spark.driver.host", "localhost") diff --git a/core/src/main/scala/org/apache/spark/util/StatCounter.scala b/core/src/main/scala/org/apache/spark/util/StatCounter.scala index 45381365f1e522750f6271e705c7c0c1dcd63820..1e02638591f8b0a8eb2718be39de01bce366e640 100644 --- a/core/src/main/scala/org/apache/spark/util/StatCounter.scala +++ b/core/src/main/scala/org/apache/spark/util/StatCounter.scala @@ -22,8 +22,8 @@ import org.apache.spark.annotation.Since /** * A class for tracking the statistics of a set of numbers (count, mean and variance) in a * numerically robust way. Includes support for merging two StatCounters. Based on Welford - * and Chan's [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance algorithms]] - * for running variance. + * and Chan's <a href="http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance"> + * algorithms</a> for running variance. * * @constructor Initialize the StatCounter with the given values. */ diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index d093e7bfc3dac14af38c8e2292f842bd3a79873f..60a6e82c6f90d33de43e5a55005f25508451b6aa 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -180,8 +180,8 @@ private[spark] object ThreadUtils { // scalastyle:off awaitresult /** - * Preferred alternative to [[Await.result()]]. This method wraps and re-throws any exceptions - * thrown by the underlying [[Await]] call, ensuring that this thread's stack trace appears in + * Preferred alternative to `Await.result()`. This method wraps and re-throws any exceptions + * thrown by the underlying `Await` call, ensuring that this thread's stack trace appears in * logs. */ @throws(classOf[SparkException]) @@ -196,7 +196,7 @@ private[spark] object ThreadUtils { } /** - * Calls [[Awaitable.result]] directly to avoid using `ForkJoinPool`'s `BlockingContext`, wraps + * Calls `Awaitable.result` directly to avoid using `ForkJoinPool`'s `BlockingContext`, wraps * and re-throws any exceptions with nice stack track. * * Codes running in the user's thread may be in a thread of Scala ForkJoinPool. As concurrent diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index a2386d6b9e12ff1cfabd117add4fd9677c85bbad..acad2fdf733c87ff98d37522d0d111269971d950 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1673,8 +1673,8 @@ private[spark] object Utils extends Logging { } /** - * NaN-safe version of [[java.lang.Double.compare()]] which allows NaN values to be compared - * according to semantics where NaN == NaN and NaN > any non-NaN double. + * NaN-safe version of `java.lang.Double.compare()` which allows NaN values to be compared + * according to semantics where NaN == NaN and NaN > any non-NaN double. */ def nanSafeCompareDoubles(x: Double, y: Double): Int = { val xIsNan: Boolean = java.lang.Double.isNaN(x) @@ -1687,8 +1687,8 @@ private[spark] object Utils extends Logging { } /** - * NaN-safe version of [[java.lang.Float.compare()]] which allows NaN values to be compared - * according to semantics where NaN == NaN and NaN > any non-NaN float. 
+ * NaN-safe version of `java.lang.Float.compare()` which allows NaN values to be compared + * according to semantics where NaN == NaN and NaN > any non-NaN float. */ def nanSafeCompareFloats(x: Float, y: Float): Int = { val xIsNan: Boolean = java.lang.Float.isNaN(x) @@ -2340,7 +2340,7 @@ private[spark] object Utils extends Logging { * A spark url (`spark://host:port`) is a special URI that its scheme is `spark` and only contains * host and port. * - * @throws SparkException if `sparkUrl` is invalid. + * @note Throws `SparkException` if sparkUrl is invalid. */ def extractHostPortFromSparkUrl(sparkUrl: String): (String, Int) = { try { diff --git a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala index 89b0874e3865a63daa08286f413f0206e8dcb636..da08661d137d0c11bd14fb07e4fb2d8179b7ec1e 100644 --- a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala +++ b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala @@ -148,7 +148,7 @@ private[spark] class ChunkedByteBuffer(var chunks: Array[ByteBuffer]) { /** * Reads data from a ChunkedByteBuffer. * - * @param dispose if true, [[ChunkedByteBuffer.dispose()]] will be called at the end of the stream + * @param dispose if true, `ChunkedByteBuffer.dispose()` will be called at the end of the stream * in order to close any memory-mapped files which back the buffer. */ private class ChunkedByteBufferInputStream( diff --git a/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala b/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala index 922ec7955fd6d6875fa3c96c453aac6352ea0952..c55a5885ba80568b540dc178b62af4304eb89731 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala @@ -54,8 +54,8 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab * * @return an RDD containing the edges in this graph * - * @see [[Edge]] for the edge type. - * @see [[Graph#triplets]] to get an RDD which contains all the edges + * @see `Edge` for the edge type. + * @see `Graph#triplets` to get an RDD which contains all the edges * along with their vertex data. * */ diff --git a/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala b/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala index f678e5f1238fb0c1c39a67e199321f488e1101b3..add21f41ea3ba8d0fa888070bb0f3d849f7eb719 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala @@ -32,7 +32,7 @@ object GraphLoader extends Logging { * id and a target id. Skips lines that begin with `#`. * * If desired the edges can be automatically oriented in the positive - * direction (source Id < target Id) by setting `canonicalOrientation` to + * direction (source Id < target Id) by setting `canonicalOrientation` to * true. 
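The Utils.scala hunk above documents NaN-safe comparison semantics (NaN == NaN, and NaN greater than any non-NaN value). A standalone restatement with an illustrative name, not the Spark utility itself:

```scala
// Standalone restatement of the ordering documented above for `nanSafeCompareDoubles`:
// NaN compares equal to NaN and greater than every non-NaN double.
def nanSafeCompare(x: Double, y: Double): Int = {
  val xIsNan = java.lang.Double.isNaN(x)
  val yIsNan = java.lang.Double.isNaN(y)
  if ((xIsNan && yIsNan) || (x == y)) 0
  else if (xIsNan) 1          // NaN sorts after any non-NaN value
  else if (yIsNan) -1
  else java.lang.Double.compare(x, y)
}

// Example: NaN ends up last when sorting with this comparator.
// Seq(1.0, Double.NaN, -3.0).sortWith((a, b) => nanSafeCompare(a, b) < 0)   // -3.0, 1.0, NaN
```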
* * @example Loads a file in the following format: diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala index 98e082cc44e1a1df464e224ac77fb402455d98b0..faa985594ec081bbe8d481b8e3ad035af72b2f9c 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala @@ -41,7 +41,7 @@ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( /** * If `partitionsRDD` already has a partitioner, use it. Otherwise assume that the - * [[PartitionID]]s in `partitionsRDD` correspond to the actual partitions and create a new + * `PartitionID`s in `partitionsRDD` correspond to the actual partitions and create a new * partitioner that allows co-partitioning with `partitionsRDD`. */ override val partitioner = diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala index f926984aa6335a96030977965a648c47ef5071fd..feb3f47667f8cae625dd1a60e5fb67c16f611d61 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala @@ -28,7 +28,7 @@ import org.apache.spark.ml.linalg.{Vector, Vectors} /** * PageRank algorithm implementation. There are two implementations of PageRank implemented. * - * The first implementation uses the standalone [[Graph]] interface and runs PageRank + * The first implementation uses the standalone `Graph` interface and runs PageRank * for a fixed number of iterations: * {{{ * var PR = Array.fill(n)( 1.0 ) @@ -41,7 +41,7 @@ import org.apache.spark.ml.linalg.{Vector, Vectors} * } * }}} * - * The second implementation uses the [[Pregel]] interface and runs PageRank until + * The second implementation uses the `Pregel` interface and runs PageRank until * convergence: * * {{{ diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala index bb2ffab0f60f8b9438bf5b3764c1e83d2485a983..59fdd855e6f3749ef82ec152bf5c29fca689099c 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala @@ -42,7 +42,8 @@ object SVDPlusPlus { /** * Implement SVD++ based on "Factorization Meets the Neighborhood: * a Multifaceted Collaborative Filtering Model", - * available at [[http://public.research.att.com/~volinsky/netflix/kdd08koren.pdf]]. + * available at <a href="http://public.research.att.com/~volinsky/netflix/kdd08koren.pdf"> + * here</a>. * * The prediction rule is rui = u + bu + bi + qi*(pu + |N(u)|^^-0.5^^*sum(y)), * see the details on page 6. 
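The SVDPlusPlus comment above quotes the prediction rule rui = u + bu + bi + qi*(pu + |N(u)|^-0.5*sum(y)). A plain-Scala restatement of that formula; all names are illustrative and this is not the GraphX implementation:

```scala
// Worked restatement of the SVD++ prediction rule quoted above:
//   r(u,i) = mu + b_u + b_i + q_i . (p_u + |N(u)|^-0.5 * sum of y_j over j in N(u))
def svdPlusPlusPredict(
    mu: Double,                      // global average rating
    bu: Double,                      // user bias
    bi: Double,                      // item bias
    qi: Array[Double],               // item factor vector
    pu: Array[Double],               // user factor vector
    implicitY: Seq[Array[Double]]    // y_j vectors for the items rated by the user, N(u)
  ): Double = {
  val rank = qi.length
  val norm = 1.0 / math.sqrt(math.max(implicitY.size, 1))
  // p_u + |N(u)|^-0.5 * sum(y_j)
  val userVec = Array.tabulate(rank) { k => pu(k) + norm * implicitY.map(_(k)).sum }
  // dot product q_i . userVec plus the bias terms
  mu + bu + bi + qi.zip(userVec).map { case (a, b) => a * b }.sum
}
```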
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala index 34e9e22c3a35a92c644ab0850e0eb0f6b6eba3f5..21b22968a1a69ed0ad871e68e90cb483e68cb753 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala @@ -36,7 +36,7 @@ import org.apache.spark.graphx._ * self cycles and canonicalizes the graph to ensure that the following conditions hold: * <ul> * <li> There are no self edges</li> - * <li> All edges are oriented src > dst</li> + * <li> All edges are oriented src > dst</li> * <li> There are no duplicate edges</li> * </ul> * However, the canonicalization procedure is costly as it requires repartitioning the graph. diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/stat/distribution/MultivariateGaussian.scala b/mllib-local/src/main/scala/org/apache/spark/ml/stat/distribution/MultivariateGaussian.scala index 0be28677eff31f93f9c98333f735e55e5f631eed..3167e0c286d474b83816eebb604af26df7063d71 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/stat/distribution/MultivariateGaussian.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/stat/distribution/MultivariateGaussian.scala @@ -28,7 +28,8 @@ import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors} * This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In * the event that the covariance matrix is singular, the density will be computed in a * reduced dimensional subspace under which the distribution is supported. - * (see [[http://en.wikipedia.org/wiki/Multivariate_normal_distribution#Degenerate_case]]) + * (see <a href="http://en.wikipedia.org/wiki/Multivariate_normal_distribution#Degenerate_case"> + * here</a>) * * @param mean The mean vector of the distribution * @param cov The covariance matrix of the distribution diff --git a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala index aa92edde7acd10b320684d970a0aec733dfb9ec5..4b43a3aa5b700e17d1d6a5727b744c1a80407ed8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala @@ -59,7 +59,7 @@ private[ml] trait PredictorParams extends Params /** * :: DeveloperApi :: * Abstraction for prediction problems (regression and classification). It accepts all NumericType - * labels and will automatically cast it to DoubleType in [[fit()]]. + * labels and will automatically cast it to DoubleType in `fit()`. * * @tparam FeaturesType Type of features. * E.g., [[org.apache.spark.mllib.linalg.VectorUDT]] for vector features. diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala index 12b9732a4c3d2d49711c089b1971ebb1d35a95a3..527cb2d547b63e6e59f4710494530f1ebdd53075 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala @@ -239,7 +239,7 @@ object AttributeGroup { } } - /** Creates an attribute group from a [[StructField]] instance. */ + /** Creates an attribute group from a `StructField` instance. 
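The TriangleCount comment above lists the canonicalization conditions: no self edges, every edge oriented src > dst, and no duplicate edges. A plain-Scala illustration of those three steps on tuple edges, not the GraphX code:

```scala
// Plain-Scala illustration of the canonicalization conditions listed above
// for TriangleCount (not the GraphX implementation).
def canonicalizeEdges(edges: Seq[(Long, Long)]): Seq[(Long, Long)] =
  edges
    .filter { case (src, dst) => src != dst }                              // drop self edges
    .map { case (src, dst) => if (src > dst) (src, dst) else (dst, src) }  // orient src > dst
    .distinct                                                              // drop duplicate edges

// Example: Seq((1L, 2L), (2L, 1L), (3L, 3L), (1L, 2L)) canonicalizes to the single edge (2, 1).
```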
*/ def fromStructField(field: StructField): AttributeGroup = { require(field.dataType == new VectorUDT) if (field.metadata.contains(ML_ATTR)) { diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala index 27554acdf3c26f4361415c4dcc41ee478a39bbe5..cc7e8bc301ad39f0554524e9a6922a26a67e32b5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala @@ -98,7 +98,7 @@ sealed abstract class Attribute extends Serializable { def toMetadata(): Metadata = toMetadata(Metadata.empty) /** - * Converts to a [[StructField]] with some existing metadata. + * Converts to a `StructField` with some existing metadata. * @param existingMetadata existing metadata to carry over */ def toStructField(existingMetadata: Metadata): StructField = { @@ -109,7 +109,7 @@ sealed abstract class Attribute extends Serializable { StructField(name.get, DoubleType, nullable = false, newMetadata) } - /** Converts to a [[StructField]]. */ + /** Converts to a `StructField`. */ def toStructField(): StructField = toStructField(Metadata.empty) override def toString: String = toMetadataImpl(withType = true).toString diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index d07b4adebb08f12558ba9e9f5f3a16253326fe9c..fe29926e0d994962971c913278b093725a0210f9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -56,13 +56,13 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** * Set threshold in binary classification, in range [0, 1]. * - * If the estimated probability of class label 1 is > threshold, then predict 1, else 0. + * If the estimated probability of class label 1 is > threshold, then predict 1, else 0. * A high threshold encourages the model to predict 0 more often; * a low threshold encourages the model to predict 1 more often. * * Note: Calling this with threshold p is equivalent to calling `setThresholds(Array(1-p, p))`. - * When [[setThreshold()]] is called, any user-set value for [[thresholds]] will be cleared. - * If both [[threshold]] and [[thresholds]] are set in a ParamMap, then they must be + * When `setThreshold()` is called, any user-set value for `thresholds` will be cleared. + * If both `threshold` and `thresholds` are set in a ParamMap, then they must be * equivalent. * * Default is 0.5. @@ -101,12 +101,12 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** * Get threshold for binary classification. * - * If [[thresholds]] is set with length 2 (i.e., binary classification), + * If `thresholds` is set with length 2 (i.e., binary classification), * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / thresholds(1))}}}. - * Otherwise, returns [[threshold]] if set, or its default value if unset. + * Otherwise, returns `threshold` if set, or its default value if unset. * * @group getParam - * @throws IllegalArgumentException if [[thresholds]] is set to an array of length other than 2. + * @throws IllegalArgumentException if `thresholds` is set to an array of length other than 2. 
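The `getThreshold` Scaladoc above gives the binary equivalence 1 / (1 + thresholds(0) / thresholds(1)), and the `setThreshold` note says threshold p corresponds to thresholds (1-p, p). A quick REPL-style check of that round trip:

```scala
// REPL-style check of the binary threshold/thresholds equivalence documented above.
val p = 0.7
val thresholds = Array(1 - p, p)        // setThreshold(0.7) is equivalent to setThresholds(Array(0.3, 0.7))
val recovered = 1.0 / (1.0 + thresholds(0) / thresholds(1))

// 1 / (1 + 0.3 / 0.7) = 0.7 (up to floating point), so the round trip is consistent.
println(recovered)   // ~0.7
```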
*/ override def getThreshold: Double = { checkThresholdConsistency() @@ -122,13 +122,13 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** * Set thresholds in multiclass (or binary) classification to adjust the probability of - * predicting each class. Array must have length equal to the number of classes, with values > 0, - * excepting that at most one value may be 0. + * predicting each class. Array must have length equal to the number of classes, + * with values > 0, excepting that at most one value may be 0. * The class with largest value p/t is predicted, where p is the original probability of that * class and t is the class's threshold. * - * Note: When [[setThresholds()]] is called, any user-set value for [[threshold]] will be cleared. - * If both [[threshold]] and [[thresholds]] are set in a ParamMap, then they must be + * Note: When `setThresholds()` is called, any user-set value for `threshold` will be cleared. + * If both `threshold` and `thresholds` are set in a ParamMap, then they must be * equivalent. * * @group setParam @@ -141,8 +141,8 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** * Get thresholds for binary or multiclass classification. * - * If [[thresholds]] is set, return its value. - * Otherwise, if [[threshold]] is set, return the equivalent thresholds for binary + * If `thresholds` is set, return its value. + * Otherwise, if `threshold` is set, return the equivalent thresholds for binary * classification: (1-threshold, threshold). * If neither are set, throw an exception. * @@ -159,9 +159,9 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas } /** - * If [[threshold]] and [[thresholds]] are both set, ensures they are consistent. + * If `threshold` and `thresholds` are both set, ensures they are consistent. * - * @throws IllegalArgumentException if [[threshold]] and [[thresholds]] are not equivalent + * @throws IllegalArgumentException if `threshold` and `thresholds` are not equivalent */ protected def checkThresholdConsistency(): Unit = { if (isSet(threshold) && isSet(thresholds)) { @@ -207,7 +207,7 @@ class LogisticRegression @Since("1.2.0") ( /** * Set the ElasticNet mixing parameter. * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. - * For 0 < alpha < 1, the penalty is a combination of L1 and L2. + * For 0 < alpha < 1, the penalty is a combination of L1 and L2. * Default is 0.0 which is an L2 penalty. * * @group setParam @@ -294,7 +294,7 @@ class LogisticRegression @Since("1.2.0") ( override def getThresholds: Array[Double] = super.getThresholds /** - * Suggested depth for treeAggregate (>= 2). + * Suggested depth for treeAggregate (>= 2). * If the dimensions of features or the number of partitions are large, * this param could be adjusted to a larger size. * Default is 2. @@ -815,7 +815,7 @@ class LogisticRegressionModel private[spark] ( /** * Predict label for the given feature vector. - * The behavior of this can be adjusted using [[thresholds]]. + * The behavior of this can be adjusted using `thresholds`. 
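The `setThresholds` Scaladoc above says the predicted class is the one with the largest ratio p/t. A small illustrative sketch of that rule, outside of Spark:

```scala
// Sketch of the multiclass rule documented above for `thresholds`:
// the predicted class is the one with the largest ratio p(k) / t(k).
def predictWithThresholds(probabilities: Array[Double], thresholds: Array[Double]): Int = {
  require(probabilities.length == thresholds.length)
  val scaled = probabilities.zip(thresholds).map { case (p, t) => p / t }
  scaled.indexOf(scaled.max)
}

// Example: probabilities (0.5, 0.3, 0.2) with thresholds (1.0, 0.25, 1.0) scale to
// (0.5, 1.2, 0.2), so class 1 is predicted even though class 0 has the highest raw probability.
```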
*/ override protected def predict(features: Vector): Double = if (isMultinomial) { super.predict(features) @@ -1274,7 +1274,7 @@ class BinaryLogisticRegressionSummary private[classification] ( * * The probability of the multinomial outcome $y$ taking on any of the K possible outcomes is: * - * <p><blockquote> + * <blockquote> * $$ * P(y_i=0|\vec{x}_i, \beta) = \frac{e^{\vec{x}_i^T \vec{\beta}_0}}{\sum_{k=0}^{K-1} * e^{\vec{x}_i^T \vec{\beta}_k}} \\ @@ -1283,7 +1283,7 @@ class BinaryLogisticRegressionSummary private[classification] ( * P(y_i=K-1|\vec{x}_i, \beta) = \frac{e^{\vec{x}_i^T \vec{\beta}_{K-1}}\,}{\sum_{k=0}^{K-1} * e^{\vec{x}_i^T \vec{\beta}_k}} * $$ - * </blockquote></p> + * </blockquote> * * The model coefficients $\beta = (\beta_0, \beta_1, \beta_2, ..., \beta_{K-1})$ become a matrix * which has dimension of $K \times (N+1)$ if the intercepts are added. If the intercepts are not @@ -1292,7 +1292,7 @@ class BinaryLogisticRegressionSummary private[classification] ( * Note that the coefficients in the model above lack identifiability. That is, any constant scalar * can be added to all of the coefficients and the probabilities remain the same. * - * <p><blockquote> + * <blockquote> * $$ * \begin{align} * \frac{e^{\vec{x}_i^T \left(\vec{\beta}_0 + \vec{c}\right)}}{\sum_{k=0}^{K-1} @@ -1302,7 +1302,7 @@ class BinaryLogisticRegressionSummary private[classification] ( * = \frac{e^{\vec{x}_i^T \vec{\beta}_0}}{\sum_{k=0}^{K-1} e^{\vec{x}_i^T \vec{\beta}_k}} * \end{align} * $$ - * </blockquote></p> + * </blockquote> * * However, when regularization is added to the loss function, the coefficients are indeed * identifiable because there is only one set of coefficients which minimizes the regularization @@ -1314,7 +1314,7 @@ class BinaryLogisticRegressionSummary private[classification] ( * The loss of objective function for a single instance of data (we do not include the * regularization term here for simplicity) can be written as * - * <p><blockquote> + * <blockquote> * $$ * \begin{align} * \ell\left(\beta, x_i\right) &= -log{P\left(y_i \middle| \vec{x}_i, \beta\right)} \\ @@ -1322,14 +1322,14 @@ class BinaryLogisticRegressionSummary private[classification] ( * &= log\left(\sum_{k=0}^{K-1} e^{margins_k}\right) - margins_y * \end{align} * $$ - * </blockquote></p> + * </blockquote> * * where ${margins}_k = \vec{x}_i^T \vec{\beta}_k$. * * For optimization, we have to calculate the first derivative of the loss function, and a simple * calculation shows that * - * <p><blockquote> + * <blockquote> * $$ * \begin{align} * \frac{\partial \ell(\beta, \vec{x}_i, w_i)}{\partial \beta_{j, k}} @@ -1338,54 +1338,54 @@ class BinaryLogisticRegressionSummary private[classification] ( * &= x_{i, j} \cdot w_i \cdot multiplier_k * \end{align} * $$ - * </blockquote></p> + * </blockquote> * * where $w_i$ is the sample weight, $I_{y=k}$ is an indicator function * - * <p><blockquote> + * <blockquote> * $$ * I_{y=k} = \begin{cases} * 1 & y = k \\ * 0 & else * \end{cases} * $$ - * </blockquote></p> + * </blockquote> * * and * - * <p><blockquote> + * <blockquote> * $$ * multiplier_k = \left(\frac{e^{\vec{x}_i \cdot \vec{\beta}_k}}{\sum_{k=0}^{K-1} * e^{\vec{x}_i \cdot \vec{\beta}_k}} - I_{y=k}\right) * $$ - * </blockquote></p> + * </blockquote> * * If any of margins is larger than 709.78, the numerical computation of multiplier and loss * function will suffer from arithmetic overflow. 
This issue occurs when there are outliers in * data which are far away from the hyperplane, and this will cause the failing of training once - * infinity is introduced. Note that this is only a concern when max(margins) > 0. + * infinity is introduced. Note that this is only a concern when max(margins) > 0. * - * Fortunately, when max(margins) = maxMargin > 0, the loss function and the multiplier can easily - * be rewritten into the following equivalent numerically stable formula. + * Fortunately, when max(margins) = maxMargin > 0, the loss function and the multiplier can + * easily be rewritten into the following equivalent numerically stable formula. * - * <p><blockquote> + * <blockquote> * $$ * \ell\left(\beta, x\right) = log\left(\sum_{k=0}^{K-1} e^{margins_k - maxMargin}\right) - * margins_{y} + maxMargin * $$ - * </blockquote></p> + * </blockquote> * * Note that each term, $(margins_k - maxMargin)$ in the exponential is no greater than zero; as a * result, overflow will not happen with this formula. * * For $multiplier$, a similar trick can be applied as the following, * - * <p><blockquote> + * <blockquote> * $$ * multiplier_k = \left(\frac{e^{\vec{x}_i \cdot \vec{\beta}_k - maxMargin}}{\sum_{k'=0}^{K-1} * e^{\vec{x}_i \cdot \vec{\beta}_{k'} - maxMargin}} - I_{y=k}\right) * $$ - * </blockquote></p> + * </blockquote> * * @param bcCoefficients The broadcast coefficients corresponding to the features. * @param bcFeaturesStd The broadcast standard deviation values of the features. @@ -1513,7 +1513,7 @@ private class LogisticAggregator( } /** - * When maxMargin > 0, the original formula could cause overflow. + * When maxMargin > 0, the original formula could cause overflow. * We address this by subtracting maxMargin from all the margins, so it's guaranteed * that all of the new margins will be smaller than zero to prevent arithmetic overflow. */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 88fe7cb4a6e0f0625e7a026f8fe8d04fb1082bfb..1b45eafbaca2340bbe3508bd30381fba79c492e2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -289,7 +289,6 @@ object MultilayerPerceptronClassifier * @param uid uid * @param layers array of layer sizes including input and output layers * @param weights the weights of layers - * @return prediction model */ @Since("1.5.0") @Experimental diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index f1a7676c74b0ebbcdbbf6e244e8704a6b072c9e2..a2ac7000003d4543cb525be8a46519d1d2c31d0b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -60,16 +60,20 @@ private[ml] trait NaiveBayesParams extends PredictorParams with HasWeightCol { final def getModelType: String = $(modelType) } +// scalastyle:off line.size.limit /** * Naive Bayes Classifiers. * It supports Multinomial NB - * ([[http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html]]) + * (see <a href="http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html"> + * here</a>) * which can handle finitely supported discrete data. 
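The LogisticAggregator comment above derives the overflow-safe form of the multinomial loss by subtracting maxMargin before exponentiating. A standalone sketch of that rescaling, not Spark's aggregator code:

```scala
// Standalone sketch of the max-margin rescaling described above: subtracting
// maxMargin keeps every exponent <= 0, so the loss
//   log(sum_k e^{margin_k}) - margin_y
// never overflows.
def multinomialLoss(margins: Array[Double], label: Int): Double = {
  val maxMargin = margins.max
  // sum_k e^{margin_k - maxMargin}; each term is in (0, 1]
  val sumExp = margins.map(m => math.exp(m - maxMargin)).sum
  math.log(sumExp) - margins(label) + maxMargin
}

// Example: margins of (800.0, 0.0) would overflow a naive exp(), but
// multinomialLoss(Array(800.0, 0.0), 1) evaluates to roughly 800.
```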
For example, by converting documents into * TF-IDF vectors, it can be used for document classification. By making every vector a * binary (0/1) data, it can also be used as Bernoulli NB - * ([[http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html]]). + * (see <a href="http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html"> + * here</a>). * The input feature values must be nonnegative. */ +// scalastyle:on line.size.limit @Since("1.5.0") class NaiveBayes @Since("1.5.0") ( @Since("1.5.0") override val uid: String) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index 52345b0626c47e493c4ec7be4a842dc78c60d0ef..907c73e2e4d0ad59964daed40342a768a373e41e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -36,7 +36,7 @@ import org.apache.spark.sql.functions._ /** - * [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] learning algorithm for + * <a href="http://en.wikipedia.org/wiki/Random_forest">Random Forest</a> learning algorithm for * classification. * It supports both binary and multiclass labels, as well as both continuous and categorical * features. @@ -144,7 +144,7 @@ object RandomForestClassifier extends DefaultParamsReadable[RandomForestClassifi } /** - * [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] model for classification. + * <a href="http://en.wikipedia.org/wiki/Random_forest">Random Forest</a> model for classification. * It supports both binary and multiclass labels, as well as both continuous and categorical * features. * @@ -249,7 +249,7 @@ class RandomForestClassificationModel private[ml] ( * (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.) * and follows the implementation from scikit-learn. * - * @see [[DecisionTreeClassificationModel.featureImportances]] + * @see `DecisionTreeClassificationModel.featureImportances` */ @Since("1.5.0") lazy val featureImportances: Vector = TreeEnsembleModel.featureImportances(trees, numFeatures) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index cf11ba37abb58cbe916fc1f76f2fa96d10105251..c7a170ddc7351b0fb0068b0b4a0caa7dc0d497cd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -42,7 +42,7 @@ private[clustering] trait BisectingKMeansParams extends Params with HasMaxIter with HasFeaturesCol with HasSeed with HasPredictionCol { /** - * The desired number of leaf clusters. Must be > 1. Default: 4. + * The desired number of leaf clusters. Must be > 1. Default: 4. * The actual number could be smaller if there are no divisible leaf clusters. * @group param */ @@ -55,8 +55,8 @@ private[clustering] trait BisectingKMeansParams extends Params def getK: Int = $(k) /** - * The minimum number of points (if >= 1.0) or the minimum proportion - * of points (if < 1.0) of a divisible cluster (default: 1.0). + * The minimum number of points (if >= 1.0) or the minimum proportion + * of points (if < 1.0) of a divisible cluster (default: 1.0). 
* @group expertParam */ @Since("2.0.0") @@ -208,9 +208,9 @@ object BisectingKMeansModel extends MLReadable[BisectingKMeansModel] { * If bisecting all divisible clusters on the bottom level would result more than `k` leaf clusters, * larger clusters get higher priority. * - * @see [[http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf - * Steinbach, Karypis, and Kumar, A comparison of document clustering techniques, - * KDD Workshop on Text Mining, 2000.]] + * @see <a href="http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf"> + * Steinbach, Karypis, and Kumar, A comparison of document clustering techniques, + * KDD Workshop on Text Mining, 2000.</a> */ @Since("2.0.0") @Experimental @@ -296,7 +296,7 @@ object BisectingKMeans extends DefaultParamsReadable[BisectingKMeans] { * :: Experimental :: * Summary of BisectingKMeans. * - * @param predictions [[DataFrame]] produced by [[BisectingKMeansModel.transform()]]. + * @param predictions `DataFrame` produced by `BisectingKMeansModel.transform()`. * @param predictionCol Name for column of predicted clusters in `predictions`. * @param featuresCol Name for column of features in `predictions`. * @param k Number of clusters. diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala index 8b5f525194f28e45a898e3997b35eb58c9afd270..44e832b058b62bb64d54dea3443ebef35c9df4bf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.{DataFrame, Row} * :: Experimental :: * Summary of clustering algorithms. * - * @param predictions [[DataFrame]] produced by model.transform(). + * @param predictions `DataFrame` produced by model.transform(). * @param predictionCol Name for column of predicted clusters in `predictions`. * @param featuresCol Name for column of features in `predictions`. * @param k Number of clusters. diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index 19998ca44b115f02b3dcf2af6d6c5aab0e3bb4fe..74109344aac0853b9b60371dfb43ba63e9f450d3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -44,7 +44,7 @@ private[clustering] trait GaussianMixtureParams extends Params with HasMaxIter w with HasSeed with HasPredictionCol with HasProbabilityCol with HasTol { /** - * Number of independent Gaussians in the mixture model. Must be > 1. Default: 2. + * Number of independent Gaussians in the mixture model. Must be > 1. Default: 2. * @group param */ @Since("2.0.0") @@ -76,7 +76,7 @@ private[clustering] trait GaussianMixtureParams extends Params with HasMaxIter w * @param weights Weight for each Gaussian distribution in the mixture. * This is a multinomial probability distribution over the k Gaussians, * where weights(i) is the weight for Gaussian i, and weights sum to 1. 
- * @param gaussians Array of [[MultivariateGaussian]] where gaussians(i) represents + * @param gaussians Array of `MultivariateGaussian` where gaussians(i) represents * the Multivariate Gaussian (Normal) Distribution for Gaussian i */ @Since("2.0.0") @@ -374,7 +374,7 @@ object GaussianMixture extends DefaultParamsReadable[GaussianMixture] { * :: Experimental :: * Summary of GaussianMixture. * - * @param predictions [[DataFrame]] produced by [[GaussianMixtureModel.transform()]]. + * @param predictions `DataFrame` produced by `GaussianMixtureModel.transform()`. * @param predictionCol Name for column of predicted clusters in `predictions`. * @param probabilityCol Name for column of predicted probability of each cluster * in `predictions`. diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index 152bd13b7a17a4895638ff8edc630e1b397ffb7e..6e124eb6ddca01aace4fab52cf8bd96c93e9f4d0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -42,7 +42,7 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe with HasSeed with HasPredictionCol with HasTol { /** - * The number of clusters to create (k). Must be > 1. Note that it is possible for fewer than + * The number of clusters to create (k). Must be > 1. Note that it is possible for fewer than * k clusters to be returned, for example, if there are fewer than k distinct points to cluster. * Default: 2. * @group param @@ -72,7 +72,7 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe /** * Param for the number of steps for the k-means|| initialization mode. This is an advanced - * setting -- the default of 2 is almost always enough. Must be > 0. Default: 2. + * setting -- the default of 2 is almost always enough. Must be > 0. Default: 2. * @group expertParam */ @Since("1.5.0") @@ -250,7 +250,7 @@ object KMeansModel extends MLReadable[KMeansModel] { * :: Experimental :: * K-means clustering with support for k-means|| initialization proposed by Bahmani et al. * - * @see [[http://dx.doi.org/10.14778/2180912.2180915 Bahmani et al., Scalable k-means++.]] + * @see <a href="http://dx.doi.org/10.14778/2180912.2180915">Bahmani et al., Scalable k-means++.</a> */ @Since("1.5.0") @Experimental @@ -346,7 +346,7 @@ object KMeans extends DefaultParamsReadable[KMeans] { * :: Experimental :: * Summary of KMeans. * - * @param predictions [[DataFrame]] produced by [[KMeansModel.transform()]]. + * @param predictions `DataFrame` produced by `KMeansModel.transform()`. * @param predictionCol Name for column of predicted clusters in `predictions`. * @param featuresCol Name for column of features in `predictions`. * @param k Number of clusters. diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 7773802854c003f1da1460efa231fe94505cc5b0..6032ab3db93503d8e82b2d9241cd007bd4426c8f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -50,7 +50,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM with HasSeed with HasCheckpointInterval { /** - * Param for the number of topics (clusters) to infer. Must be > 1. Default: 10. + * Param for the number of topics (clusters) to infer. Must be > 1. Default: 10. 
* * @group param */ @@ -78,13 +78,13 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM * - EM * - Currently only supports symmetric distributions, so all values in the vector should be * the same. - * - Values should be > 1.0 + * - Values should be > 1.0 * - default = uniformly (50 / k) + 1, where 50/k is common in LDA libraries and +1 follows * from Asuncion et al. (2009), who recommend a +1 adjustment for EM. * - Online - * - Values should be >= 0 + * - Values should be >= 0 * - default = uniformly (1.0 / k), following the implementation from - * [[https://github.com/Blei-Lab/onlineldavb]]. + * <a href="https://github.com/Blei-Lab/onlineldavb">here</a>. * * @group param */ @@ -120,13 +120,13 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM * * Optimizer-specific parameter settings: * - EM - * - Value should be > 1.0 + * - Value should be > 1.0 * - default = 0.1 + 1, where 0.1 gives a small amount of smoothing and +1 follows * Asuncion et al. (2009), who recommend a +1 adjustment for EM. * - Online - * - Value should be >= 0 + * - Value should be >= 0 * - default = (1.0 / k), following the implementation from - * [[https://github.com/Blei-Lab/onlineldavb]]. + * <a href="https://github.com/Blei-Lab/onlineldavb">here</a>. * * @group param */ @@ -162,11 +162,11 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM * - Online LDA: * Hoffman, Blei and Bach. "Online Learning for Latent Dirichlet Allocation." * Neural Information Processing Systems, 2010. - * [[http://www.cs.columbia.edu/~blei/papers/HoffmanBleiBach2010b.pdf]] + * See <a href="http://www.cs.columbia.edu/~blei/papers/HoffmanBleiBach2010b.pdf">here</a> * - EM: * Asuncion et al. "On Smoothing and Inference for Topic Models." * Uncertainty in Artificial Intelligence, 2009. - * [[http://arxiv.org/pdf/1205.2662.pdf]] + * See <a href="http://arxiv.org/pdf/1205.2662.pdf">here</a> * * @group param */ @@ -245,9 +245,9 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM * Fraction of the corpus to be sampled and used in each iteration of mini-batch gradient descent, * in range (0, 1]. * - * Note that this should be adjusted in synch with [[LDA.maxIter]] + * Note that this should be adjusted in synch with `LDA.maxIter` * so the entire corpus is used. Specifically, set both so that - * maxIterations * miniBatchFraction >= 1. + * maxIterations * miniBatchFraction >= 1. * * Note: This is the same as the `miniBatchFraction` parameter in * [[org.apache.spark.mllib.clustering.OnlineLDAOptimizer]]. @@ -293,8 +293,8 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM * cause failures if a data partition is lost, so set this bit with care. * Note that checkpoints will be cleaned up via reference counting, regardless. * - * See [[DistributedLDAModel.getCheckpointFiles]] for getting remaining checkpoints and - * [[DistributedLDAModel.deleteCheckpointFiles]] for removing remaining checkpoints. + * See `DistributedLDAModel.getCheckpointFiles` for getting remaining checkpoints and + * `DistributedLDAModel.deleteCheckpointFiles` for removing remaining checkpoints. * * Default: true * @@ -431,7 +431,7 @@ sealed abstract class LDAModel private[ml] ( private[ml] def getEffectiveTopicConcentration: Double = getModel.topicConcentration /** - * The features for LDA should be a [[Vector]] representing the word counts in a document. 
+ * The features for LDA should be a `Vector` representing the word counts in a document. * The vector should be of length vocabSize, with counts for each term (word). * * @group setParam @@ -650,7 +650,7 @@ object LocalLDAModel extends MLReadable[LocalLDAModel] { * for each training document. * * @param oldLocalModelOption Used to implement [[oldLocalModel]] as a lazy val, but keeping - * [[copy()]] cheap. + * `copy()` cheap. */ @Since("1.6.0") @Experimental @@ -701,7 +701,7 @@ class DistributedLDAModel private[ml] ( * - Even with [[logPrior]], this is NOT the same as the data log likelihood given the * hyperparameters. * - This is computed from the topic distributions computed during training. If you call - * [[logLikelihood()]] on the same training dataset, the topic distributions will be computed + * `logLikelihood()` on the same training dataset, the topic distributions will be computed * again, possibly giving different results. */ @Since("1.6.0") @@ -719,7 +719,7 @@ class DistributedLDAModel private[ml] ( /** * :: DeveloperApi :: * - * If using checkpointing and [[LDA.keepLastCheckpoint]] is set to true, then there may be + * If using checkpointing and `LDA.keepLastCheckpoint` is set to true, then there may be * saved checkpoint files. This method is provided so that users can manage those files. * * Note that removing the checkpoints can cause failures if a partition is lost and is needed @@ -804,13 +804,13 @@ object DistributedLDAModel extends MLReadable[DistributedLDAModel] { * * Input data (featuresCol): * LDA is given a collection of documents as input data, via the featuresCol parameter. - * Each document is specified as a [[Vector]] of length vocabSize, where each entry is the + * Each document is specified as a `Vector` of length vocabSize, where each entry is the * count for the corresponding term (word) in the document. Feature transformers such as * [[org.apache.spark.ml.feature.Tokenizer]] and [[org.apache.spark.ml.feature.CountVectorizer]] * can be useful for converting text to word count vectors. * - * @see [[http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation Latent Dirichlet allocation - * (Wikipedia)]] + * @see <a href="http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation"> + * Latent Dirichlet allocation (Wikipedia)</a> */ @Since("1.6.0") @Experimental @@ -826,7 +826,7 @@ class LDA @Since("1.6.0") ( optimizeDocConcentration -> true, keepLastCheckpoint -> true) /** - * The features for LDA should be a [[Vector]] representing the word counts in a document. + * The features for LDA should be a `Vector` representing the word counts in a document. * The vector should be of length vocabSize, with counts for each term (word). * * @group setParam diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala index 6ff36b35ca4c1628b0c99c0a8475a1f954d6e898..682787a830113d9d00bd7f8f74600af42a426929 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala @@ -32,7 +32,8 @@ import org.apache.spark.sql.types.DataType * It returns a real vector of the same length representing the DCT. The return vector is scaled * such that the transform matrix is unitary (aka scaled DCT-II). * - * More information on [[https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia]]. 
+ * More information on <a href="https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II"> + * DCT-II in Discrete cosine transform (Wikipedia)</a>. */ @Since("1.5.0") class DCT @Since("1.5.0") (@Since("1.5.0") override val uid: String) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index d9d0f32254e246b6b6a128312a211945b8a8c2b0..f37233e1ab9c8d78a55ec696716b4ec057c7a8bf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -37,7 +37,8 @@ import org.apache.spark.sql.types.StructType * where `k_i` is the i-th coefficient, and both `x` and `k_i` are from `Z_prime^*` * * Reference: - * [[https://en.wikipedia.org/wiki/Perfect_hash_function Wikipedia on Perfect Hash Function]] + * <a href="https://en.wikipedia.org/wiki/Perfect_hash_function"> + * Wikipedia on Perfect Hash Function</a> * * @param numEntries The number of entries of the hash functions. * @param randCoefficients An array of random coefficients, each used by one hash function. @@ -98,7 +99,7 @@ class MinHashModel private[ml] ( * as binary "1" values. * * References: - * [[https://en.wikipedia.org/wiki/MinHash Wikipedia on MinHash]] + * <a href="https://en.wikipedia.org/wiki/MinHash">Wikipedia on MinHash</a> */ @Experimental @Since("2.1.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala index ccfb0ce8f85caf6f579392e7cbf883ce9f9ac371..19978c97d2cfdcdf1b57217fbb93e2a9a26d5d9c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala @@ -78,11 +78,11 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with H * statistics, which is also known as min-max normalization or Rescaling. The rescaled value for * feature E is calculated as: * - * <p><blockquote> + * <blockquote> * $$ * Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min * $$ - * </blockquote></p> + * </blockquote> * * For the case $E_{max} == E_{min}$, $Rescaled(e_i) = 0.5 * (max + min)$. * diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala index 25fb6be5afd814a472d20a53a077b842d8592667..4be17da3e9f76fccd651315bd3a16ca0c982e1d2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala @@ -30,10 +30,12 @@ import org.apache.spark.sql.types.DataType /** * Perform feature expansion in a polynomial space. As said in wikipedia of Polynomial Expansion, - * which is available at [[http://en.wikipedia.org/wiki/Polynomial_expansion]], "In mathematics, an - * expansion of a product of sums expresses it as a sum of products by using the fact that - * multiplication distributes over addition". Take a 2-variable feature vector as an example: - * `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`. + * which is available at + * <a href="http://en.wikipedia.org/wiki/Polynomial_expansion">Polynomial expansion (Wikipedia)</a> + * , "In mathematics, an expansion of a product of sums expresses it as a sum of products by using + * the fact that multiplication distributes over addition". 
Take a 2-variable feature vector + * as an example: `(x, y)`, if we want to expand it with degree 2, then we get + * `(x, x * x, y, x * y, y * y)`. */ @Since("1.4.0") class PolynomialExpansion @Since("1.4.0") (@Since("1.4.0") override val uid: String) @@ -76,11 +78,11 @@ class PolynomialExpansion @Since("1.4.0") (@Since("1.4.0") override val uid: Str * (n + d choose d) (including 1 and first-order values). For example, let f([a, b, c], 3) be the * function that expands [a, b, c] to their monomials of degree 3. We have the following recursion: * - * <p><blockquote> + * <blockquote> * $$ * f([a, b, c], 3) &= f([a, b], 3) ++ f([a, b], 2) * c ++ f([a, b], 1) * c^2 ++ [c^3] * $$ - * </blockquote></p> + * </blockquote> * * To handle sparsity, if c is zero, we can skip all monomials that contain it. We remember the * current index and increment it properly for sparse input. diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala index 1b524c6710b42deaee8ed71794081e468da97dfb..2bff59a0da1730aa2fb282b5471f8c6c940b5018 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -113,8 +113,8 @@ class RandomProjectionModel private[ml] ( * * References: * - * 1. [[https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions - * Wikipedia on Stable Distributions]] + * 1. <a href="https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions"> + * Wikipedia on Stable Distributions</a> * * 2. Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint * arXiv:1408.2927 (2014). diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index d76d556280e96decc0b1d0a6ef6912d3c54c5afc..8f125d8fd51d201498d62ba3ccc1b78612b9ceda 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -79,8 +79,8 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with * statistics on the samples in the training set. * * The "unit std" is computed using the - * [[https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation - * corrected sample standard deviation]], + * <a href="https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation"> + * corrected sample standard deviation</a>, * which is computed as the square root of the unbiased sample variance. */ @Since("1.2.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 0ced21365ff6f93cef01c27032367a40bd5fa04e..a55816249c74bfc373581b7c34defc637164f7e0 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.types.{ArrayType, StringType, StructType} * @note null values from input array are preserved unless adding null to stopWords * explicitly. 
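A brief usage sketch of the transformer documented above; the input DataFrame `df` and the column names "raw" and "filtered" are assumptions for the example:

{{{
import org.apache.spark.ml.feature.StopWordsRemover

// `df` is assumed: a DataFrame with an array-of-strings column named "raw".
val remover = new StopWordsRemover()
  .setInputCol("raw")
  .setOutputCol("filtered")
remover.transform(df).show()
}}}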
* - * @see [[http://en.wikipedia.org/wiki/Stop_words]] + * @see <a href="http://en.wikipedia.org/wiki/Stop_words">Stop words (Wikipedia)</a> */ @Since("1.5.0") class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String) @@ -132,7 +132,8 @@ object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] { * Loads the default stop words for the given language. * Supported languages: danish, dutch, english, finnish, french, german, hungarian, * italian, norwegian, portuguese, russian, spanish, swedish, turkish - * @see [[http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/]] + * @see <a href="http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/"> + * here</a> */ @Since("2.0.0") def loadDefaultStopWords(language: String): Array[String] = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala index b94187ae787cc9976a0146c9129f800fc17af537..5dd648aecc95c3b0855a450f57b5300f71c43fa7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala @@ -84,6 +84,7 @@ import org.apache.spark.sql.DataFrame * input dataset, while MLlib's feature transformers operate lazily on individual columns, * which is more efficient and flexible to handle large and complex datasets. * - * @see [[http://scikit-learn.org/stable/modules/preprocessing.html scikit-learn.preprocessing]] + * @see <a href="http://scikit-learn.org/stable/modules/preprocessing.html"> + * scikit-learn.preprocessing</a> */ package object feature diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala index 8a6b862cda170e6080145828114b891a8070f23a..143bf539b0afebf1b8a250cba5d38e1bd797bb23 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala @@ -50,9 +50,10 @@ private[ml] class IterativelyReweightedLeastSquaresModel( * @param maxIter maximum number of iterations. * @param tol the convergence tolerance. * - * @see [[http://www.jstor.org/stable/2345503 P. J. Green, Iteratively Reweighted Least Squares - * for Maximum Likelihood Estimation, and some Robust and Resistant Alternatives, - * Journal of the Royal Statistical Society. Series B, 1984.]] + * @see <a href="http://www.jstor.org/stable/2345503">P. J. Green, Iteratively + * Reweighted Least Squares for Maximum Likelihood Estimation, and some Robust + * and Resistant Alternatives, Journal of the Royal Statistical Society. + * Series B, 1984.</a> */ private[ml] class IterativelyReweightedLeastSquares( val initialModel: WeightedLeastSquaresModel, diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index fa4530927e8b04c5154372a6fa96e3896d629077..e3e03dfd43dd6e2f8f00e814e5adfe6c22b1028a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -29,7 +29,7 @@ import org.apache.spark.ml.param._ private[ml] trait HasRegParam extends Params { /** - * Param for regularization parameter (>= 0). + * Param for regularization parameter (>= 0). 
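These shared params surface as setters on concrete estimators; a small sketch, using LinearRegression purely as an illustrative host with placeholder values:

{{{
import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression()
  .setRegParam(0.1)  // regularization parameter (>= 0)
  .setMaxIter(100)   // maximum number of iterations (>= 0)
  .setTol(1e-6)      // convergence tolerance (>= 0)
}}}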
* @group param */ final val regParam: DoubleParam = new DoubleParam(this, "regParam", "regularization parameter (>= 0)", ParamValidators.gtEq(0)) @@ -44,7 +44,7 @@ private[ml] trait HasRegParam extends Params { private[ml] trait HasMaxIter extends Params { /** - * Param for maximum number of iterations (>= 0). + * Param for maximum number of iterations (>= 0). * @group param */ final val maxIter: IntParam = new IntParam(this, "maxIter", "maximum number of iterations (>= 0)", ParamValidators.gtEq(0)) @@ -238,7 +238,7 @@ private[ml] trait HasOutputCol extends Params { private[ml] trait HasCheckpointInterval extends Params { /** - * Param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. + * Param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. * @group param */ final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations", (interval: Int) => interval == -1 || interval >= 1) @@ -334,7 +334,7 @@ private[ml] trait HasElasticNetParam extends Params { private[ml] trait HasTol extends Params { /** - * Param for the convergence tolerance for iterative algorithms (>= 0). + * Param for the convergence tolerance for iterative algorithms (>= 0). * @group param */ final val tol: DoubleParam = new DoubleParam(this, "tol", "the convergence tolerance for iterative algorithms (>= 0)", ParamValidators.gtEq(0)) @@ -349,7 +349,7 @@ private[ml] trait HasTol extends Params { private[ml] trait HasStepSize extends Params { /** - * Param for Step size to be used for each iteration of optimization (> 0). + * Param for Step size to be used for each iteration of optimization (> 0). * @group param */ final val stepSize: DoubleParam = new DoubleParam(this, "stepSize", "Step size to be used for each iteration of optimization (> 0)", ParamValidators.gt(0)) @@ -396,7 +396,7 @@ private[ml] trait HasSolver extends Params { private[ml] trait HasAggregationDepth extends Params { /** - * Param for suggested depth for treeAggregate (>= 2). + * Param for suggested depth for treeAggregate (>= 2). * @group expertParam */ final val aggregationDepth: IntParam = new IntParam(this, "aggregationDepth", "suggested depth for treeAggregate (>= 2)", ParamValidators.gtEq(2)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index 9d5ba999781f60bf275b086490ccc3ccfa26979d..d6ad1ea6d10964163cb01f49960d45175fd15d94 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -119,7 +119,8 @@ private[regression] trait AFTSurvivalRegressionParams extends Params /** * :: Experimental :: * Fit a parametric survival regression model named accelerated failure time (AFT) model - * ([[https://en.wikipedia.org/wiki/Accelerated_failure_time_model]]) + * (see <a href="https://en.wikipedia.org/wiki/Accelerated_failure_time_model"> + * Accelerated failure time model (Wikipedia)</a>) * based on the Weibull distribution of the survival time. 
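A minimal, hypothetical fit of this estimator; the `training` DataFrame and its column names are assumed:

{{{
import org.apache.spark.ml.regression.AFTSurvivalRegression

// `training` is assumed: a DataFrame with "features", "label" and "censor" columns.
val aft = new AFTSurvivalRegression()
  .setQuantileProbabilities(Array(0.3, 0.6))
  .setQuantilesCol("quantiles")
val model = aft.fit(training)
model.transform(training).show(false)
}}}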
*/ @Experimental @@ -432,24 +433,24 @@ object AFTSurvivalRegressionModel extends MLReadable[AFTSurvivalRegressionModel] * Given the values of the covariates $x^{'}$, for random lifetime $t_{i}$ of subjects i = 1,..,n, * with possible right-censoring, the likelihood function under the AFT model is given as * - * <p><blockquote> + * <blockquote> * $$ * L(\beta,\sigma)=\prod_{i=1}^n[\frac{1}{\sigma}f_{0} * (\frac{\log{t_{i}}-x^{'}\beta}{\sigma})]^{\delta_{i}}S_{0} * (\frac{\log{t_{i}}-x^{'}\beta}{\sigma})^{1-\delta_{i}} * $$ - * </blockquote></p> + * </blockquote> * * Where $\delta_{i}$ is the indicator of the event has occurred i.e. uncensored or not. * Using $\epsilon_{i}=\frac{\log{t_{i}}-x^{'}\beta}{\sigma}$, the log-likelihood function * assumes the form * - * <p><blockquote> + * <blockquote> * $$ * \iota(\beta,\sigma)=\sum_{i=1}^{n}[-\delta_{i}\log\sigma+ * \delta_{i}\log{f_{0}}(\epsilon_{i})+(1-\delta_{i})\log{S_{0}(\epsilon_{i})}] * $$ - * </blockquote></p> + * </blockquote> * Where $S_{0}(\epsilon_{i})$ is the baseline survivor function, * and $f_{0}(\epsilon_{i})$ is corresponding density function. * @@ -458,34 +459,34 @@ object AFTSurvivalRegressionModel extends MLReadable[AFTSurvivalRegressionModel] * to extreme value distribution for log of the lifetime, * and the $S_{0}(\epsilon)$ function is * - * <p><blockquote> + * <blockquote> * $$ * S_{0}(\epsilon_{i})=\exp(-e^{\epsilon_{i}}) * $$ - * </blockquote></p> + * </blockquote> * * and the $f_{0}(\epsilon_{i})$ function is * - * <p><blockquote> + * <blockquote> * $$ * f_{0}(\epsilon_{i})=e^{\epsilon_{i}}\exp(-e^{\epsilon_{i}}) * $$ - * </blockquote></p> + * </blockquote> * * The log-likelihood function for Weibull distribution of lifetime is * - * <p><blockquote> + * <blockquote> * $$ * \iota(\beta,\sigma)= * -\sum_{i=1}^n[\delta_{i}\log\sigma-\delta_{i}\epsilon_{i}+e^{\epsilon_{i}}] * $$ - * </blockquote></p> + * </blockquote> * * Due to minimizing the negative log-likelihood equivalent to maximum a posteriori probability, * the loss function we use to optimize is $-\iota(\beta,\sigma)$. * The gradient functions for $\beta$ and $\log\sigma$ respectively are * - * <p><blockquote> + * <blockquote> * $$ * \frac{\partial (-\iota)}{\partial \beta}= * \sum_{1=1}^{n}[\delta_{i}-e^{\epsilon_{i}}]\frac{x_{i}}{\sigma} \\ @@ -493,7 +494,7 @@ object AFTSurvivalRegressionModel extends MLReadable[AFTSurvivalRegressionModel] * \frac{\partial (-\iota)}{\partial (\log\sigma)}= * \sum_{i=1}^{n}[\delta_{i}+(\delta_{i}-e^{\epsilon_{i}})\epsilon_{i}] * $$ - * </blockquote></p> + * </blockquote> * * @param bcParameters The broadcasted value includes three part: The log of scale parameter, * the intercept and regression coefficients corresponding to the features. diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index 1419da874709f8717a061fb7be3523228e53aa60..894b6a2ca2041578ab343406a5e614bc685990e3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -38,8 +38,8 @@ import org.apache.spark.sql.functions._ /** - * [[http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree]] learning algorithm - * for regression. + * <a href="http://en.wikipedia.org/wiki/Decision_tree_learning">Decision tree</a> + * learning algorithm for regression. * It supports both continuous and categorical features. 
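A usage sketch for the regressor documented above; the `trainingData` and `testData` DataFrames and their column names are assumptions:

{{{
import org.apache.spark.ml.regression.DecisionTreeRegressor

// `trainingData`/`testData` are assumed DataFrames with "label" and "features" columns.
val dt = new DecisionTreeRegressor()
  .setLabelCol("label")
  .setFeaturesCol("features")
  .setMaxDepth(5)
val model = dt.fit(trainingData)
val predictions = model.transform(testData)
}}}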
*/ @Since("1.4.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index fa69d60836e68a5a17c648bef4568a183604967c..ed2d05525d611c48c65cbd706f6798e9cc0a793e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -38,7 +38,7 @@ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions._ /** - * [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]] + * <a href="http://en.wikipedia.org/wiki/Gradient_boosting">Gradient-Boosted Trees (GBTs)</a> * learning algorithm for regression. * It supports both continuous and categorical features. * @@ -151,7 +151,7 @@ object GBTRegressor extends DefaultParamsReadable[GBTRegressor] { } /** - * [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]] + * <a href="http://en.wikipedia.org/wiki/Gradient_boosting">Gradient-Boosted Trees (GBTs)</a> * model for regression. * It supports both continuous and categorical features. * @param _trees Decision trees in the ensemble. diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index f33dd0fd294ba136d38c664800509e45a181c749..1201ecd5e4e61d54e646972891b8692b6a0249e2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -123,9 +123,11 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam /** * :: Experimental :: * - * Fit a Generalized Linear Model ([[https://en.wikipedia.org/wiki/Generalized_linear_model]]) - * specified by giving a symbolic description of the linear predictor (link function) and - * a description of the error distribution (family). + * Fit a Generalized Linear Model + * (see <a href="https://en.wikipedia.org/wiki/Generalized_linear_model"> + * Generalized linear model (Wikipedia)</a>) + * specified by giving a symbolic description of the linear + * predictor (link function) and a description of the error distribution (family). * It supports "gaussian", "binomial", "poisson" and "gamma" as family. * Valid link functions for each family is listed below. The first link function of each family * is the default one. @@ -196,11 +198,11 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val /** * Sets the regularization parameter for L2 regularization. * The regularization term is - * <p><blockquote> + * <blockquote> * $$ * 0.5 * regParam * L2norm(coefficients)^2 * $$ - * </blockquote></p> + * </blockquote> * Default is 0.0. * * @group setParam diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 8ea5e1e6c453a557944adeda8ea59ecc57c40f05..eb4e38cc83c1933f9a490d850f2ae1d14d7410a4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -624,7 +624,8 @@ class LinearRegressionSummary private[regression] ( /** * Returns the explained variance regression score. 
* explainedVariance = 1 - variance(y - \hat{y}) / variance(y) - * Reference: [[http://en.wikipedia.org/wiki/Explained_variation]] + * Reference: <a href="http://en.wikipedia.org/wiki/Explained_variation"> + * Wikipedia explain variation</a> * * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. * This will change in later Spark versions. @@ -664,7 +665,8 @@ class LinearRegressionSummary private[regression] ( /** * Returns R^2^, the coefficient of determination. - * Reference: [[http://en.wikipedia.org/wiki/Coefficient_of_determination]] + * Reference: <a href="http://en.wikipedia.org/wiki/Coefficient_of_determination"> + * Wikipedia coefficient of determination</a> * * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. * This will change in later Spark versions. @@ -805,11 +807,11 @@ class LinearRegressionSummary private[regression] ( * When training with intercept enabled, * The objective function in the scaled space is given by * - * <p><blockquote> + * <blockquote> * $$ * L = 1/2n ||\sum_i w_i(x_i - \bar{x_i}) / \hat{x_i} - (y - \bar{y}) / \hat{y}||^2, * $$ - * </blockquote></p> + * </blockquote> * * where $\bar{x_i}$ is the mean of $x_i$, $\hat{x_i}$ is the standard deviation of $x_i$, * $\bar{y}$ is the mean of label, and $\hat{y}$ is the standard deviation of label. @@ -820,7 +822,7 @@ class LinearRegressionSummary private[regression] ( * * This can be rewritten as * - * <p><blockquote> + * <blockquote> * $$ * \begin{align} * L &= 1/2n ||\sum_i (w_i/\hat{x_i})x_i - \sum_i (w_i/\hat{x_i})\bar{x_i} - y / \hat{y} @@ -828,34 +830,34 @@ class LinearRegressionSummary private[regression] ( * &= 1/2n ||\sum_i w_i^\prime x_i - y / \hat{y} + offset||^2 = 1/2n diff^2 * \end{align} * $$ - * </blockquote></p> + * </blockquote> * * where $w_i^\prime$ is the effective coefficients defined by $w_i/\hat{x_i}$, offset is * - * <p><blockquote> + * <blockquote> * $$ * - \sum_i (w_i/\hat{x_i})\bar{x_i} + \bar{y} / \hat{y}. * $$ - * </blockquote></p> + * </blockquote> * * and diff is * - * <p><blockquote> + * <blockquote> * $$ * \sum_i w_i^\prime x_i - y / \hat{y} + offset * $$ - * </blockquote></p> + * </blockquote> * * Note that the effective coefficients and offset don't depend on training dataset, * so they can be precomputed. * * Now, the first derivative of the objective function in scaled space is * - * <p><blockquote> + * <blockquote> * $$ * \frac{\partial L}{\partial w_i} = diff/N (x_i - \bar{x_i}) / \hat{x_i} * $$ - * </blockquote></p> + * </blockquote> * * However, $(x_i - \bar{x_i})$ will densify the computation, so it's not * an ideal formula when the training dataset is sparse format. @@ -865,7 +867,7 @@ class LinearRegressionSummary private[regression] ( * objective function from all the samples is * * - * <p><blockquote> + * <blockquote> * $$ * \begin{align} * \frac{\partial L}{\partial w_i} &= @@ -874,14 +876,14 @@ class LinearRegressionSummary private[regression] ( * &= 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) + correction_i) * \end{align} * $$ - * </blockquote></p> + * </blockquote> * * where $correction_i = - diffSum \bar{x_i} / \hat{x_i}$ * * A simple math can show that diffSum is actually zero, so we don't even * need to add the correction terms in the end. 
From the definition of diff, * - * <p><blockquote> + * <blockquote> * $$ * \begin{align} * diffSum &= \sum_j (\sum_i w_i(x_{ij} - \bar{x_i}) @@ -890,17 +892,17 @@ class LinearRegressionSummary private[regression] ( * &= 0 * \end{align} * $$ - * </blockquote></p> + * </blockquote> * * As a result, the first derivative of the total objective function only depends on * the training dataset, which can be easily computed in distributed fashion, and is * sparse format friendly. * - * <p><blockquote> + * <blockquote> * $$ * \frac{\partial L}{\partial w_i} = 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) * $$ - * </blockquote></p> + * </blockquote> * * @param bcCoefficients The broadcast coefficients corresponding to the features. * @param labelStd The standard deviation value of the label. diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index 0ad00aa6f92804e26bfc20879f8e0f74348e6ec4..d60f05eed58d949e69ef31fbe9aba2985de5237e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -37,7 +37,8 @@ import org.apache.spark.sql.functions._ /** - * [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] learning algorithm for regression. + * <a href="http://en.wikipedia.org/wiki/Random_forest">Random Forest</a> + * learning algorithm for regression. * It supports both continuous and categorical features. */ @Since("1.4.0") @@ -132,7 +133,7 @@ object RandomForestRegressor extends DefaultParamsReadable[RandomForestRegressor } /** - * [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] model for regression. + * <a href="http://en.wikipedia.org/wiki/Random_forest">Random Forest</a> model for regression. * It supports both continuous and categorical features. * * @param _trees Decision trees in the ensemble. diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala index e1376927030e45f903b9162affb4c0be05e7173c..e4de8483cfa3c10b87d6152f55e7bea572480dde 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala @@ -17,15 +17,12 @@ package org.apache.spark.ml.source.libsvm -import org.apache.spark.ml.linalg.Vector -import org.apache.spark.sql.{DataFrame, DataFrameReader} - /** - * `libsvm` package implements Spark SQL data source API for loading LIBSVM data as [[DataFrame]]. - * The loaded [[DataFrame]] has two columns: `label` containing labels stored as doubles and - * `features` containing feature vectors stored as [[Vector]]s. + * `libsvm` package implements Spark SQL data source API for loading LIBSVM data as `DataFrame`. + * The loaded `DataFrame` has two columns: `label` containing labels stored as doubles and + * `features` containing feature vectors stored as `Vector`s. * - * To use LIBSVM data source, you need to set "libsvm" as the format in [[DataFrameReader]] and + * To use LIBSVM data source, you need to set "libsvm" as the format in `DataFrameReader` and * optionally specify options, for example: * {{{ * // Scala @@ -51,6 +48,6 @@ import org.apache.spark.sql.{DataFrame, DataFrameReader} * @note This class is public for documentation purpose. Please don't use this class directly. 
* Rather, use the data source API as illustrated above. * - * @see [[https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ LIBSVM datasets]] + * @see <a href="https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/">LIBSVM datasets</a> */ class LibSVMDataSource private() {} diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala index 0a0bc4c006389543b3cd3dd349b4cd9479998b8f..f3bace818157098a06782a6241890b505b183efc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala @@ -34,7 +34,7 @@ private[spark] object GradientBoostedTrees extends Logging { /** * Method to train a gradient boosting model - * @param input Training dataset: RDD of [[LabeledPoint]]. + * @param input Training dataset: RDD of `LabeledPoint`. * @param seed Random seed. * @return tuple of ensemble models and weights: * (array of decision tree models, array of model weights) @@ -59,12 +59,12 @@ private[spark] object GradientBoostedTrees extends Logging { /** * Method to validate a gradient boosting model - * @param input Training dataset: RDD of [[LabeledPoint]]. + * @param input Training dataset: RDD of `LabeledPoint`. * @param validationInput Validation dataset. * This dataset should be different from the training dataset, * but it should follow the same distribution. * E.g., these two datasets could be created from an original dataset - * by using [[org.apache.spark.rdd.RDD.randomSplit()]] + * by using `org.apache.spark.rdd.RDD.randomSplit()` * @param seed Random seed. * @return tuple of ensemble models and weights: * (array of decision tree models, array of model weights) @@ -162,7 +162,7 @@ private[spark] object GradientBoostedTrees extends Logging { * Method to calculate error of the base learner for the gradient boosting calculation. * Note: This method is not used by the gradient boosting algorithm but is useful for debugging * purposes. - * @param data Training dataset: RDD of [[LabeledPoint]]. + * @param data Training dataset: RDD of `LabeledPoint`. * @param trees Boosted Decision Tree models * @param treeWeights Learning rates at each boosting iteration. * @param loss evaluation metric. @@ -184,7 +184,7 @@ private[spark] object GradientBoostedTrees extends Logging { /** * Method to compute error or loss for every iteration of gradient boosting. * - * @param data RDD of [[LabeledPoint]] + * @param data RDD of `LabeledPoint` * @param trees Boosted Decision Tree models * @param treeWeights Learning rates at each boosting iteration. * @param loss evaluation metric. diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala index 8ae5ca3c84b0e2591a7efe306c3f99391b8c8232..a61ea374cbd46fc4aecba433e044862b65415aee 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala @@ -82,7 +82,7 @@ private[spark] object RandomForest extends Logging { /** * Train a random forest. 
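Callers normally reach this private helper through the public estimator; a hedged sketch of that route, with the `trainingData` DataFrame and its columns assumed:

{{{
import org.apache.spark.ml.regression.RandomForestRegressor

// `trainingData` is assumed: a DataFrame with "label" and "features" columns.
val rf = new RandomForestRegressor()
  .setNumTrees(20)
  .setFeatureSubsetStrategy("onethird")  // features considered per split
val model = rf.fit(trainingData)
}}}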
* - * @param input Training data: RDD of [[LabeledPoint]] + * @param input Training data: RDD of `LabeledPoint` * @return an unweighted set of trees */ def run( diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index 5a551533be9ca30cfedb992e4663632a56da1037..40510ad804ef0dbacde708b337e8b8160fc9b21f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -342,9 +342,9 @@ private[ml] trait HasFeatureSubsetStrategy extends Params { * - sqrt: recommended by Breiman manual for random forests * - The defaults of sqrt (classification) and onethird (regression) match the R randomForest * package. - * @see [[http://www.stat.berkeley.edu/~breiman/randomforest2001.pdf Breiman (2001)]] - * @see [[http://www.stat.berkeley.edu/~breiman/Using_random_forests_V3.1.pdf Breiman manual for - * random forests]] + * @see <a href="http://www.stat.berkeley.edu/~breiman/randomforest2001.pdf">Breiman (2001)</a> + * @see <a href="http://www.stat.berkeley.edu/~breiman/Using_random_forests_V3.1.pdf"> + * Breiman manual for random forests</a> * * @group param */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 6ea52ef7f025f839efc0cd1b8e4027b2bb9c69d1..85191d46fd360336a5e021558770b1373da19d34 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -40,7 +40,7 @@ import org.apache.spark.sql.types.StructType */ private[ml] trait CrossValidatorParams extends ValidatorParams { /** - * Param for number of folds for cross validation. Must be >= 2. + * Param for number of folds for cross validation. Must be >= 2. * Default: 3 * * @group param @@ -198,7 +198,7 @@ object CrossValidator extends MLReadable[CrossValidator] { * * @param bestModel The best model selected from k-fold cross validation. * @param avgMetrics Average cross-validation metrics for each paramMap in - * [[CrossValidator.estimatorParamMaps]], in the corresponding order. + * `CrossValidator.estimatorParamMaps`, in the corresponding order. */ @Since("1.2.0") class CrossValidatorModel private[ml] ( diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala index e5fa5d53e3fcad260f6cc3d60e31c86baad87d37..5b7e5ec75c8424e931e591bf65fa3bb0141644d4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala @@ -163,7 +163,7 @@ trait MLWritable { /** * :: DeveloperApi :: * - * Helper trait for making simple [[Params]] types writable. If a [[Params]] class stores + * Helper trait for making simple `Params` types writable. If a `Params` class stores * all data as [[org.apache.spark.ml.param.Param]] values, then extending this trait will provide * a default implementation of writing saved instances of the class. * This only handles simple [[org.apache.spark.ml.param.Param]] types; e.g., it will not handle @@ -231,7 +231,7 @@ trait MLReadable[T] { /** * :: DeveloperApi :: * - * Helper trait for making simple [[Params]] types readable. If a [[Params]] class stores + * Helper trait for making simple `Params` types readable. 
If a `Params` class stores * all data as [[org.apache.spark.ml.param.Param]] values, then extending this trait will provide * a default implementation of reading saved instances of the class. * This only handles simple [[org.apache.spark.ml.param.Param]] types; e.g., it will not handle @@ -360,7 +360,7 @@ private[ml] object DefaultParamsReader { /** * Get the JSON value of the [[org.apache.spark.ml.param.Param]] of the given name. - * This can be useful for getting a Param value before an instance of [[Params]] + * This can be useful for getting a Param value before an instance of `Params` * is available. */ def getParamValue(paramName: String): JValue = { @@ -438,7 +438,7 @@ private[ml] object DefaultParamsReader { } /** - * Load a [[Params]] instance from the given path, and return it. + * Load a `Params` instance from the given path, and return it. * This assumes the instance implements [[MLReadable]]. */ def loadParamsInstance[T](path: String, sc: SparkContext): T = { @@ -454,7 +454,7 @@ private[ml] object DefaultParamsReader { private[ml] object MetaAlgorithmReadWrite { /** * Examine the given estimator (which may be a compound estimator) and extract a mapping - * from UIDs to corresponding [[Params]] instances. + * from UIDs to corresponding `Params` instances. */ def getUidMap(instance: Params): Map[String, Params] = { val uidList = getUidMapImpl(instance) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 767d056861a8b7bcc6694764eae57cd4cbf590f2..fa46ba3ace5086324fc2d6108ac29c7f80dc300b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -302,10 +302,11 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { /** * Trains a Naive Bayes model given an RDD of `(label, features)` pairs. * - * This is the Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all kinds of - * discrete data. For example, by converting documents into TF-IDF vectors, it can be used for - * document classification. By making every vector a 0-1 vector, it can also be used as - * Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). The input feature values must be nonnegative. + * This is the Multinomial NB (see <a href="http://tinyurl.com/lsdw6p">here</a>) which can + * handle all kinds of discrete data. For example, by converting documents into TF-IDF + * vectors, it can be used for document classification. By making every vector a 0-1 vector, + * it can also be used as Bernoulli NB (see <a href="http://tinyurl.com/p7c96j6">here</a>). + * The input feature values must be nonnegative. */ @Since("0.9.0") class NaiveBayes private ( @@ -402,9 +403,9 @@ object NaiveBayes { /** * Trains a Naive Bayes model given an RDD of `(label, features)` pairs. * - * This is the default Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all - * kinds of discrete data. For example, by converting documents into TF-IDF vectors, it - * can be used for document classification. + * This is the default Multinomial NB (see <a href="http://tinyurl.com/lsdw6p">here</a>) + * which can handle all kinds of discrete data. For example, by converting documents into + * TF-IDF vectors, it can be used for document classification. * * This version of the method uses a default smoothing parameter of 1.0. 
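For example, a sketch under the assumption that `training` is an RDD[LabeledPoint] with nonnegative feature values:

{{{
import org.apache.spark.mllib.classification.NaiveBayes

// Multinomial NB with the default smoothing parameter (1.0).
val multinomialModel = NaiveBayes.train(training)
// Explicit smoothing and model type; Bernoulli expects 0-1 feature values.
val bernoulliModel = NaiveBayes.train(training, 1.0, "bernoulli")
}}}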
* @@ -419,9 +420,9 @@ object NaiveBayes { /** * Trains a Naive Bayes model given an RDD of `(label, features)` pairs. * - * This is the default Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all - * kinds of discrete data. For example, by converting documents into TF-IDF vectors, it - * can be used for document classification. + * This is the default Multinomial NB (see <a href="http://tinyurl.com/lsdw6p">here</a>) + * which can handle all kinds of discrete data. For example, by converting documents + * into TF-IDF vectors, it can be used for document classification. * * @param input RDD of `(label, array of features)` pairs. Every vector should be a frequency * vector or a count vector. @@ -435,9 +436,10 @@ object NaiveBayes { /** * Trains a Naive Bayes model given an RDD of `(label, features)` pairs. * - * The model type can be set to either Multinomial NB ([[http://tinyurl.com/lsdw6p]]) - * or Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). The Multinomial NB can handle - * discrete count data and can be called by setting the model type to "multinomial". + * The model type can be set to either Multinomial NB (see <a href="http://tinyurl.com/lsdw6p"> + * here</a>) or Bernoulli NB (see <a href="http://tinyurl.com/p7c96j6">here</a>). + * The Multinomial NB can handle discrete count data and can be called by setting the model + * type to "multinomial". * For example, it can be used with word counts or TF_IDF vectors of documents. * The Bernoulli model fits presence or absence (0-1) counts. By making every vector a * 0-1 vector and setting the model type to "bernoulli", the fits and predicts as diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index e6b89712e219d73bb3515dbbcdb510993b1d961a..31f51417528b3d35433b540f5143d445a874bef4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -43,13 +43,14 @@ import org.apache.spark.storage.StorageLevel * @param k the desired number of leaf clusters (default: 4). The actual number could be smaller if * there are no divisible leaf clusters. * @param maxIterations the max number of k-means iterations to split clusters (default: 20) - * @param minDivisibleClusterSize the minimum number of points (if >= 1.0) or the minimum proportion - * of points (if < 1.0) of a divisible cluster (default: 1) + * @param minDivisibleClusterSize the minimum number of points (if >= 1.0) or the minimum + * proportion of points (if < 1.0) of a divisible cluster + * (default: 1) * @param seed a random seed (default: hash value of the class name) * - * @see [[http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf - * Steinbach, Karypis, and Kumar, A comparison of document clustering techniques, - * KDD Workshop on Text Mining, 2000.]] + * @see <a href="http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf"> + * Steinbach, Karypis, and Kumar, A comparison of document clustering techniques, + * KDD Workshop on Text Mining, 2000.</a> */ @Since("1.6.0") class BisectingKMeans private ( @@ -100,8 +101,8 @@ class BisectingKMeans private ( def getMaxIterations: Int = this.maxIterations /** - * Sets the minimum number of points (if >= `1.0`) or the minimum proportion of points - * (if < `1.0`) of a divisible cluster (default: 1). 
+ * Sets the minimum number of points (if >= `1.0`) or the minimum proportion of points + * (if < `1.0`) of a divisible cluster (default: 1). */ @Since("1.6.0") def setMinDivisibleClusterSize(minDivisibleClusterSize: Double): this.type = { @@ -112,8 +113,8 @@ class BisectingKMeans private ( } /** - * Gets the minimum number of points (if >= `1.0`) or the minimum proportion of points - * (if < `1.0`) of a divisible cluster. + * Gets the minimum number of points (if >= `1.0`) or the minimum proportion of points + * (if < `1.0`) of a divisible cluster. */ @Since("1.6.0") def getMinDivisibleClusterSize: Double = minDivisibleClusterSize @@ -218,7 +219,7 @@ class BisectingKMeans private ( } /** - * Java-friendly version of [[run()]]. + * Java-friendly version of `run()`. */ def run(data: JavaRDD[Vector]): BisectingKMeansModel = run(data.rdd) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 8438015ccecea8f44cf334fc6ab0ed0a7bea54a7..6f1ab091b2317e6a9512746215552f6642ff0496 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -71,7 +71,7 @@ class BisectingKMeansModel private[clustering] ( } /** - * Java-friendly version of [[predict()]]. + * Java-friendly version of `predict()`. */ @Since("1.6.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = @@ -95,7 +95,7 @@ class BisectingKMeansModel private[clustering] ( } /** - * Java-friendly version of [[computeCost()]]. + * Java-friendly version of `computeCost()`. */ @Since("1.6.0") def computeCost(data: JavaRDD[Vector]): Double = this.computeCost(data.rdd) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index 56cdeea5f7a3f9ca6ddb894e0f77f5c58a4fca3f..6873d4277a8db3cbc75dab8bd817de9ac009af62 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -234,7 +234,7 @@ class GaussianMixture private ( } /** - * Java-friendly version of [[run()]] + * Java-friendly version of `run()` */ @Since("1.3.0") def run(data: JavaRDD[Vector]): GaussianMixtureModel = run(data.rdd) @@ -273,8 +273,8 @@ class GaussianMixture private ( private[clustering] object GaussianMixture { /** - * Heuristic to distribute the computation of the [[MultivariateGaussian]]s, approximately when - * d > 25 except for when k is very small. + * Heuristic to distribute the computation of the `MultivariateGaussian`s, approximately when + * d > 25 except for when k is very small. 
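A hedged usage sketch of the public entry point around this helper; `data` is assumed to be an RDD[Vector]:

{{{
import org.apache.spark.mllib.clustering.GaussianMixture

// `data` is assumed: an RDD[Vector] of feature vectors.
val gmm = new GaussianMixture()
  .setK(3)
  .setMaxIterations(100)
val model = gmm.run(data)
model.weights.zip(model.gaussians).foreach { case (w, g) =>
  println(s"weight=$w, mu=${g.mu}, sigma=${g.sigma}")
}
}}}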
* @param k Number of topics * @param d Number of features */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index c30cc3e2398e41a5ca75f7678356e8d568455fb2..afbe4f978b28612b4fbe9614a784def43656a0e9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -80,7 +80,7 @@ class GaussianMixtureModel @Since("1.3.0") ( } /** - * Java-friendly version of [[predict()]] + * Java-friendly version of `predict()` */ @Since("1.4.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index 7c52abdeaac223afe181cb223778a2b463aad7bd..16742bd284e69c5f03b238b6299007146a67bae1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -39,8 +39,8 @@ import org.apache.spark.util.Utils * - Original LDA paper (journal version): * Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003. * - * @see [[http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation Latent Dirichlet allocation - * (Wikipedia)]] + * @see <a href="http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation"> + * Latent Dirichlet allocation (Wikipedia)</a> */ @Since("1.3.0") class LDA private ( @@ -113,20 +113,20 @@ class LDA private ( * * If set to a singleton vector Vector(-1), then docConcentration is set automatically. If set to * singleton vector Vector(t) where t != -1, then t is replicated to a vector of length k during - * [[LDAOptimizer.initialize()]]. Otherwise, the [[docConcentration]] vector must be length k. + * `LDAOptimizer.initialize()`. Otherwise, the [[docConcentration]] vector must be length k. * (default = Vector(-1) = automatic) * * Optimizer-specific parameter settings: * - EM * - Currently only supports symmetric distributions, so all values in the vector should be * the same. - * - Values should be > 1.0 + * - Values should be > 1.0 * - default = uniformly (50 / k) + 1, where 50/k is common in LDA libraries and +1 follows * from Asuncion et al. (2009), who recommend a +1 adjustment for EM. * - Online - * - Values should be >= 0 + * - Values should be >= 0 * - default = uniformly (1.0 / k), following the implementation from - * [[https://github.com/Blei-Lab/onlineldavb]]. + * <a href="https://github.com/Blei-Lab/onlineldavb">here</a>. */ @Since("1.5.0") def setDocConcentration(docConcentration: Vector): this.type = { @@ -158,13 +158,13 @@ class LDA private ( def getAlpha: Double = getDocConcentration /** - * Alias for [[setDocConcentration()]] + * Alias for `setDocConcentration()` */ @Since("1.5.0") def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha) /** - * Alias for [[setDocConcentration()]] + * Alias for `setDocConcentration()` */ @Since("1.3.0") def setAlpha(alpha: Double): this.type = setDocConcentration(alpha) @@ -195,13 +195,13 @@ class LDA private ( * * Optimizer-specific parameter settings: * - EM - * - Value should be > 1.0 + * - Value should be > 1.0 * - default = 0.1 + 1, where 0.1 gives a small amount of smoothing and +1 follows * Asuncion et al. (2009), who recommend a +1 adjustment for EM. 
* - Online - * - Value should be >= 0 + * - Value should be >= 0 * - default = (1.0 / k), following the implementation from - * [[https://github.com/Blei-Lab/onlineldavb]]. + * <a href="https://github.com/Blei-Lab/onlineldavb">here</a>. */ @Since("1.3.0") def setTopicConcentration(topicConcentration: Double): this.type = { @@ -321,7 +321,7 @@ class LDA private ( * @param documents RDD of documents, which are term (word) count vectors paired with IDs. * The term count vectors are "bags of words" with a fixed-size vocabulary * (where the vocabulary size is the length of the vector). - * Document IDs must be unique and >= 0. + * Document IDs must be unique and >= 0. * @return Inferred LDA model */ @Since("1.3.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index b5b0e64a2a6c6bdc5f6803967de10d3b984ab2f4..017fbc6feb0d7183e9f25270642310b5c528b270 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -171,7 +171,7 @@ abstract class LDAModel private[clustering] extends Saveable { * The term count vectors are "bags of words" with a fixed-size vocabulary * (where the vocabulary size is the length of the vector). * This must use the same vocabulary (ordering of term counts) as in training. - * Document IDs must be unique and >= 0. + * Document IDs must be unique and >= 0. * @return Estimated topic distribution for each document. * The returned RDD may be zipped with the given RDD, where each returned vector * is a multinomial distribution over topics. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 7365ea1f200da381b120e9cbf926afc8d2a41bb2..9687fc8804e89fe3d8ffaa298f005f9b02d8791a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -563,7 +563,7 @@ private[clustering] object OnlineLDAOptimizer { * * An optimization (Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001) * avoids explicit computation of variational parameter `phi`. - * @see [[http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.7566]] + * @see <a href="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.7566">here</a> * * @return Returns a tuple of `gammad` - estimate of gamma, the topic distribution, `sstatsd` - * statistics for updating lambda and `ids` - list of termCounts vector indices. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala index c760ddd6ad40b9b81eb05b42df73c7129ee6f7d5..4d3e265455da6e2b50ae13ac39845feedcd9755a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala @@ -36,7 +36,7 @@ import org.apache.spark.util.random.XORShiftRandom * Model produced by [[PowerIterationClustering]]. 
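A usage sketch for the clustering algorithm that produces this model; the `similarities` RDD of (srcId, dstId, similarity) triples is assumed:

{{{
import org.apache.spark.mllib.clustering.PowerIterationClustering

// `similarities` is assumed: an RDD[(Long, Long, Double)] of pairwise similarities.
val pic = new PowerIterationClustering()
  .setK(3)
  .setMaxIterations(20)
val model = pic.run(similarities)
model.assignments.collect().foreach { a =>
  println(s"${a.id} -> ${a.cluster}")
}
}}}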
* * @param k number of clusters - * @param assignments an RDD of clustering [[PowerIterationClustering#Assignment]]s + * @param assignments an RDD of clustering `PowerIterationClustering#Assignment`s */ @Since("1.3.0") class PowerIterationClusteringModel @Since("1.3.0") ( @@ -103,9 +103,9 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode /** * Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by - * [[http://www.icml2010.org/papers/387.pdf Lin and Cohen]]. From the abstract: PIC finds a very - * low-dimensional embedding of a dataset using truncated power iteration on a normalized pair-wise - * similarity matrix of the data. + * <a href="http://www.icml2010.org/papers/387.pdf">Lin and Cohen</a>. From the abstract: PIC finds + * a very low-dimensional embedding of a dataset using truncated power iteration on a normalized + * pair-wise similarity matrix of the data. * * @param k Number of clusters. * @param maxIterations Maximum number of iterations of the PIC algorithm. @@ -113,7 +113,8 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode * as vertex properties, or "degree" to use normalized sum similarities. * Default: random. * - * @see [[http://en.wikipedia.org/wiki/Spectral_clustering Spectral clustering (Wikipedia)]] + * @see <a href="http://en.wikipedia.org/wiki/Spectral_clustering"> + * Spectral clustering (Wikipedia)</a> */ @Since("1.3.0") class PowerIterationClustering private[clustering] ( @@ -210,7 +211,7 @@ class PowerIterationClustering private[clustering] ( } /** - * A Java-friendly version of [[PowerIterationClustering.run]]. + * A Java-friendly version of `PowerIterationClustering.run`. */ @Since("1.3.0") def run(similarities: JavaRDD[(java.lang.Long, java.lang.Long, java.lang.Double)]) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala index f20ab09bf0b4276da3568b88260d45ee0994bce5..85c37c438d93a1e5e5e304e9cb1213beb964d05b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala @@ -39,14 +39,14 @@ import org.apache.spark.util.random.XORShiftRandom * generalized to incorporate forgetfullness (i.e. decay). * The update rule (for each cluster) is: * - * <p><blockquote> + * <blockquote> * $$ * \begin{align} * c_t+1 &= [(c_t * n_t * a) + (x_t * m_t)] / [n_t + m_t] \\ * n_t+t &= n_t * a + m_t * \end{align} * $$ - * </blockquote></p> + * </blockquote> * * Where c_t is the previously estimated centroid for that cluster, * n_t is the number of points assigned to it thus far, x_t is the centroid diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala index 8f777cc35b93f02839234c9797310cd9f1b05ffd..ad99b00a31fd5de20812bb49c201a7653d65a570 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala @@ -74,7 +74,8 @@ class RegressionMetrics @Since("2.0.0") ( /** * Returns the variance explained by regression. 
* explainedVariance = $\sum_i (\hat{y_i} - \bar{y})^2^ / n$ - * @see [[https://en.wikipedia.org/wiki/Fraction_of_variance_unexplained]] + * @see <a href="https://en.wikipedia.org/wiki/Fraction_of_variance_unexplained"> + * Fraction of variance unexplained (Wikipedia)</a> */ @Since("1.2.0") def explainedVariance: Double = { @@ -110,10 +111,11 @@ class RegressionMetrics @Since("2.0.0") ( /** * Returns R^2^, the unadjusted coefficient of determination. - * @see [[http://en.wikipedia.org/wiki/Coefficient_of_determination]] + * @see <a href="http://en.wikipedia.org/wiki/Coefficient_of_determination"> + * Coefficient of determination (Wikipedia)</a> * In case of regression through the origin, the definition of R^2^ is to be modified. - * @see J. G. Eisenhauer, Regression through the Origin. Teaching Statistics 25, 76-80 (2003) - * [[https://online.stat.psu.edu/~ajw13/stat501/SpecialTopics/Reg_thru_origin.pdf]] + * @see <a href="https://online.stat.psu.edu/~ajw13/stat501/SpecialTopics/Reg_thru_origin.pdf"> + * J. G. Eisenhauer, Regression through the Origin. Teaching Statistics 25, 76-80 (2003)</a> */ @Since("1.2.0") def r2: Double = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala index 0f7fbe9556c5d447472886738fd01dc8d71ca976..b53386012280d3a1984163e7a4d21f2a7b8e8468 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala @@ -147,18 +147,18 @@ object FPGrowthModel extends Loader[FPGrowthModel[_]] { /** * A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in - * [[http://dx.doi.org/10.1145/1454008.1454027 Li et al., PFP: Parallel FP-Growth for Query - * Recommendation]]. PFP distributes computation in such a way that each worker executes an + * <a href="http://dx.doi.org/10.1145/1454008.1454027">Li et al., PFP: Parallel FP-Growth for Query + * Recommendation</a>. PFP distributes computation in such a way that each worker executes an * independent group of mining tasks. The FP-Growth algorithm is described in - * [[http://dx.doi.org/10.1145/335191.335372 Han et al., Mining frequent patterns without candidate - * generation]]. + * <a href="http://dx.doi.org/10.1145/335191.335372">Han et al., Mining frequent patterns without + * candidate generation</a>. * * @param minSupport the minimal support level of the frequent pattern, any pattern that appears * more than (minSupport * size-of-the-dataset) times will be output * @param numPartitions number of partitions used by parallel FP-growth * - * @see [[http://en.wikipedia.org/wiki/Association_rule_learning Association rule learning - * (Wikipedia)]] + * @see <a href="http://en.wikipedia.org/wiki/Association_rule_learning"> + * Association rule learning (Wikipedia)</a> * */ @Since("1.3.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala index 7382000791cfb75890660fade3c0b36118a89143..a5641672218dd54db7180c26f5e1364665a39c4f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala @@ -44,7 +44,8 @@ import org.apache.spark.storage.StorageLevel /** * A parallel PrefixSpan algorithm to mine frequent sequential patterns. * The PrefixSpan algorithm is described in J. 
Pei, et al., PrefixSpan: Mining Sequential Patterns - * Efficiently by Prefix-Projected Pattern Growth ([[http://doi.org/10.1109/ICDE.2001.914830]]). + * Efficiently by Prefix-Projected Pattern Growth + * (see <a href="http://doi.org/10.1109/ICDE.2001.914830">here</a>). * * @param minSupport the minimal support level of the sequential pattern, any pattern that appears * more than (minSupport * size-of-the-dataset) times will be output @@ -55,8 +56,8 @@ import org.apache.spark.storage.StorageLevel * processing. If a projected database exceeds this size, another * iteration of distributed prefix growth is run. * - * @see [[https://en.wikipedia.org/wiki/Sequential_Pattern_Mining Sequential Pattern Mining - * (Wikipedia)]] + * @see <a href="https://en.wikipedia.org/wiki/Sequential_Pattern_Mining">Sequential Pattern Mining + * (Wikipedia)</a> */ @Since("1.5.0") class PrefixSpan private ( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index 03866753b50eee4ef4b1ebc47f3dc1330f8b53b7..9e75217410d36a7057eeafffe0234da4022f60aa 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -385,10 +385,10 @@ class BlockMatrix @Since("1.3.0") ( /** * Adds the given block matrix `other` to `this` block matrix: `this + other`. * The matrices must have the same size and matching `rowsPerBlock` and `colsPerBlock` - * values. If one of the blocks that are being added are instances of [[SparseMatrix]], - * the resulting sub matrix will also be a [[SparseMatrix]], even if it is being added - * to a [[DenseMatrix]]. If two dense matrices are added, the output will also be a - * [[DenseMatrix]]. + * values. If one of the blocks that are being added are instances of `SparseMatrix`, + * the resulting sub matrix will also be a `SparseMatrix`, even if it is being added + * to a `DenseMatrix`. If two dense matrices are added, the output will also be a + * `DenseMatrix`. */ @Since("1.3.0") def add(other: BlockMatrix): BlockMatrix = @@ -397,10 +397,10 @@ class BlockMatrix @Since("1.3.0") ( /** * Subtracts the given block matrix `other` from `this` block matrix: `this - other`. * The matrices must have the same size and matching `rowsPerBlock` and `colsPerBlock` - * values. If one of the blocks that are being subtracted are instances of [[SparseMatrix]], - * the resulting sub matrix will also be a [[SparseMatrix]], even if it is being subtracted - * from a [[DenseMatrix]]. If two dense matrices are subtracted, the output will also be a - * [[DenseMatrix]]. + * values. If one of the blocks that are being subtracted are instances of `SparseMatrix`, + * the resulting sub matrix will also be a `SparseMatrix`, even if it is being subtracted + * from a `DenseMatrix`. If two dense matrices are subtracted, the output will also be a + * `DenseMatrix`. */ @Since("2.0.0") def subtract(other: BlockMatrix): BlockMatrix = @@ -447,8 +447,8 @@ class BlockMatrix @Since("1.3.0") ( /** * Left multiplies this [[BlockMatrix]] to `other`, another [[BlockMatrix]]. The `colsPerBlock` * of this matrix must equal the `rowsPerBlock` of `other`. If `other` contains - * [[SparseMatrix]], they will have to be converted to a [[DenseMatrix]]. The output - * [[BlockMatrix]] will only consist of blocks of [[DenseMatrix]]. This may cause + * `SparseMatrix`, they will have to be converted to a `DenseMatrix`. 
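To illustrate the block-type rules described above for `add`, `subtract`, and `multiply`, a hedged sketch built from a `CoordinateMatrix` (which, per the conversion docs below, produces `SparseMatrix` blocks); `sc` is assumed and the entries are illustrative:

```scala
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}

val entries = sc.parallelize(Seq(
  MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 2.0), MatrixEntry(1, 1, 3.0)))

// toBlockMatrix creates blocks of SparseMatrix.
val a = new CoordinateMatrix(entries).toBlockMatrix(1024, 1024).cache()

val sum = a.add(a)          // sparse + sparse: the result keeps SparseMatrix blocks
val product = a.multiply(a) // multiply always produces DenseMatrix blocks
```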
The output + * [[BlockMatrix]] will only consist of blocks of `DenseMatrix`. This may cause * some performance issues until support for multiplying two sparse matrices is added. * * @note The behavior of multiply has changed in 1.6.0. `multiply` used to throw an error when diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala index 008b03d1cc33407e597b26f36f35ad6390b4cd8f..d2c5b14a5b128c00223bc006a0f16f3ecd20858d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala @@ -101,14 +101,14 @@ class CoordinateMatrix @Since("1.0.0") ( toIndexedRowMatrix().toRowMatrix() } - /** Converts to BlockMatrix. Creates blocks of [[SparseMatrix]] with size 1024 x 1024. */ + /** Converts to BlockMatrix. Creates blocks of `SparseMatrix` with size 1024 x 1024. */ @Since("1.3.0") def toBlockMatrix(): BlockMatrix = { toBlockMatrix(1024, 1024) } /** - * Converts to BlockMatrix. Creates blocks of [[SparseMatrix]]. + * Converts to BlockMatrix. Creates blocks of `SparseMatrix`. * @param rowsPerBlock The number of rows of each block. The blocks at the bottom edge may have * a smaller value. Must be an integer value greater than 0. * @param colsPerBlock The number of columns of each block. The blocks at the right edge may have diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala index 809906a158337a236a0c8c3b36496ef9463b4089..590e959daa1f4fb521062f9a3a8f48b9c12df507 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala @@ -90,14 +90,14 @@ class IndexedRowMatrix @Since("1.0.0") ( new RowMatrix(rows.map(_.vector), 0L, nCols) } - /** Converts to BlockMatrix. Creates blocks of [[SparseMatrix]] with size 1024 x 1024. */ + /** Converts to BlockMatrix. Creates blocks of `SparseMatrix` with size 1024 x 1024. */ @Since("1.3.0") def toBlockMatrix(): BlockMatrix = { toBlockMatrix(1024, 1024) } /** - * Converts to BlockMatrix. Creates blocks of [[SparseMatrix]]. + * Converts to BlockMatrix. Creates blocks of `SparseMatrix`. * @param rowsPerBlock The number of rows of each block. The blocks at the bottom edge may have * a smaller value. Must be an integer value greater than 0. * @param colsPerBlock The number of columns of each block. The blocks at the right edge may have diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 4b120332ab8d85611e08d7eff2a7d489ba17ced9..78a8810052aefeb07eb1a08f3616197f5080c249 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -531,7 +531,7 @@ class RowMatrix @Since("1.0.0") ( * decomposition (factorization) for the [[RowMatrix]] of a tall and skinny shape. * Reference: * Paul G. Constantine, David F. Gleich. 
"Tall and skinny QR factorizations in MapReduce - * architectures" ([[http://dx.doi.org/10.1145/1996092.1996103]]) + * architectures" (see <a href="http://dx.doi.org/10.1145/1996092.1996103">here</a>) * * @param computeQ whether to computeQ * @return QRDecomposition(Q, R), Q = null if computeQ = false. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala index c49e72646bf13e5cf5115bc9377746af001c9a59..0efce3c76f15afa953f19960a0a2f0c644ce24bf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala @@ -67,14 +67,14 @@ abstract class Gradient extends Serializable { * http://statweb.stanford.edu/~tibs/ElemStatLearn/ , Eq. (4.17) on page 119 gives the formula of * multinomial logistic regression model. A simple calculation shows that * - * <p><blockquote> + * <blockquote> * $$ * P(y=0|x, w) = 1 / (1 + \sum_i^{K-1} \exp(x w_i))\\ * P(y=1|x, w) = exp(x w_1) / (1 + \sum_i^{K-1} \exp(x w_i))\\ * ...\\ * P(y=K-1|x, w) = exp(x w_{K-1}) / (1 + \sum_i^{K-1} \exp(x w_i))\\ * $$ - * </blockquote></p> + * </blockquote> * * for K classes multiclass classification problem. * @@ -83,7 +83,7 @@ abstract class Gradient extends Serializable { * will be (K-1) * N. * * As a result, the loss of objective function for a single instance of data can be written as - * <p><blockquote> + * <blockquote> * $$ * \begin{align} * l(w, x) &= -log P(y|x, w) = -\alpha(y) log P(y=0|x, w) - (1-\alpha(y)) log P(y|x, w) \\ @@ -91,7 +91,7 @@ abstract class Gradient extends Serializable { * &= log(1 + \sum_i^{K-1}\exp(margins_i)) - (1-\alpha(y)) margins_{y-1} * \end{align} * $$ - * </blockquote></p> + * </blockquote> * * where $\alpha(i) = 1$ if $i \ne 0$, and * $\alpha(i) = 0$ if $i == 0$, @@ -100,7 +100,7 @@ abstract class Gradient extends Serializable { * For optimization, we have to calculate the first derivative of the loss function, and * a simple calculation shows that * - * <p><blockquote> + * <blockquote> * $$ * \begin{align} * \frac{\partial l(w, x)}{\partial w_{ij}} &= @@ -108,7 +108,7 @@ abstract class Gradient extends Serializable { * &= multiplier_i * x_j * \end{align} * $$ - * </blockquote></p> + * </blockquote> * * where $\delta_{i, j} = 1$ if $i == j$, * $\delta_{i, j} = 0$ if $i != j$, and @@ -118,12 +118,12 @@ abstract class Gradient extends Serializable { * If any of margins is larger than 709.78, the numerical computation of multiplier and loss * function will be suffered from arithmetic overflow. This issue occurs when there are outliers * in data which are far away from hyperplane, and this will cause the failing of training once - * infinity / infinity is introduced. Note that this is only a concern when max(margins) > 0. + * infinity / infinity is introduced. Note that this is only a concern when max(margins) > 0. * - * Fortunately, when max(margins) = maxMargin > 0, the loss function and the multiplier can be + * Fortunately, when max(margins) = maxMargin > 0, the loss function and the multiplier can be * easily rewritten into the following equivalent numerically stable formula. 
* - * <p><blockquote> + * <blockquote> * $$ * \begin{align} * l(w, x) &= log(1 + \sum_i^{K-1}\exp(margins_i)) - (1-\alpha(y)) margins_{y-1} \\ @@ -132,7 +132,7 @@ abstract class Gradient extends Serializable { * &= log(1 + sum) + maxMargin - (1-\alpha(y)) margins_{y-1} * \end{align} * $$ - * </blockquote></p> + * </blockquote> * where sum = $\exp(-maxMargin) + \sum_i^{K-1}\exp(margins_i - maxMargin) - 1$. * @@ -141,7 +141,7 @@ abstract class Gradient extends Serializable { * * For multiplier, similar trick can be applied as the following, * - * <p><blockquote> + * <blockquote> * $$ * \begin{align} * multiplier @@ -150,7 +150,7 @@ abstract class Gradient extends Serializable { * &= \exp(margins_i - maxMargin) / (1 + sum) - (1-\alpha(y)\delta_{y, i+1}) * \end{align} * $$ - * </blockquote></p> + * </blockquote> * * where each term in $\exp$ is also smaller than zero, so overflow is not a concern. * diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala index 123e0bb3e607ad81823302809b758368e306b820..67da88e804da2d0c004c2d8cbede9ffef50d379f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala @@ -88,10 +88,10 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va * convergenceTol is a condition which decides iteration termination. * The end of iteration is decided based on below logic. * - * - If the norm of the new solution vector is >1, the diff of solution vectors + * - If the norm of the new solution vector is >1, the diff of solution vectors * is compared to relative tolerance which means normalizing by the norm of * the new solution vector. - * - If the norm of the new solution vector is <=1, the diff of solution vectors + * - If the norm of the new solution vector is <=1, the diff of solution vectors * is compared to absolute tolerance which is not normalizing. * * Must be between 0.0 and 1.0 inclusively. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala index e49363c2c64d93d46d53a5e4a5e164b83be6bc4d..6232ff30a747ed475f43bb06cc18b6d104115556 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala @@ -31,7 +31,8 @@ import org.apache.spark.rdd.RDD /** * :: DeveloperApi :: * Class used to solve an optimization problem using Limited-memory BFGS. - * Reference: [[http://en.wikipedia.org/wiki/Limited-memory_BFGS]] + * Reference: <a href="http://en.wikipedia.org/wiki/Limited-memory_BFGS"> + * Wikipedia on Limited-memory BFGS</a> * @param gradient Gradient function to be used. * @param updater Updater to be used to update weights after every iteration. */ @@ -48,8 +49,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater) * Set the number of corrections used in the LBFGS update. Default 10. * Values of numCorrections less than 3 are not recommended; large values * of numCorrections will result in excessive computing time. - * 3 < numCorrections < 10 is recommended. - * Restriction: numCorrections > 0 + * 3 < numCorrections < 10 is recommended. 
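The `numCorrections` guidance above can be shown with a small L-BFGS configuration sketch; the gradient/updater choice and all values here are illustrative, and `data` / `initialWeights` are assumed inputs:

```scala
import org.apache.spark.mllib.optimization.{LBFGS, LogisticGradient, SquaredL2Updater}

val lbfgs = new LBFGS(new LogisticGradient(), new SquaredL2Updater())
  .setNumCorrections(10)   // must be > 0; 3 < numCorrections < 10 is the recommended range
  .setConvergenceTol(1e-4)
  .setNumIterations(100)
  .setRegParam(0.1)

// val weights = lbfgs.optimize(data, initialWeights)  // data: RDD[(Double, Vector)], assumed
```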
+ * Restriction: numCorrections > 0 */ def setNumCorrections(corrections: Int): this.type = { require(corrections > 0, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/NNLS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/NNLS.scala index 64d52bae009072796aac604b594cc09c074b8e70..b7c9fcfbfe60f2e21bedbbac5c99bcde60cbf692 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/NNLS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/NNLS.scala @@ -54,7 +54,7 @@ private[spark] object NNLS { * * We solve the problem * min_x 1/2 x^T ata x^T - x^T atb - * subject to x >= 0 + * subject to x >= 0 * * The method used is similar to one described by Polyak (B. T. Polyak, The conjugate gradient * method in extremal problems, Zh. Vychisl. Mat. Mat. Fiz. 9(4)(1969), pp. 94-112) for bound- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala index 67d484575db528f4660e8ed761cdcafcff3dabb1..aa7dd1aaa60fedd1b7c6caf6c6e77e8536ed0013 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala @@ -95,9 +95,9 @@ class SimpleUpdater extends Updater { * The corresponding proximal operator for the L1 norm is the soft-thresholding * function. That is, each weight component is shrunk towards 0 by shrinkageVal. * - * If w > shrinkageVal, set weight component to w-shrinkageVal. - * If w < -shrinkageVal, set weight component to w+shrinkageVal. - * If -shrinkageVal < w < shrinkageVal, set weight component to 0. + * If w > shrinkageVal, set weight component to w-shrinkageVal. + * If w < -shrinkageVal, set weight component to w+shrinkageVal. + * If -shrinkageVal < w < shrinkageVal, set weight component to 0. * * Equivalently, set weight component to signum(w) * max(0.0, abs(w) - shrinkageVal) */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/package.scala b/mllib/src/main/scala/org/apache/spark/mllib/package.scala index 9810b6f66806404169fab1192d80e921eb5923ec..8323afcb6a8338266408009c3f66b97c74cd78ea 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/package.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/package.scala @@ -32,7 +32,7 @@ package org.apache.spark * to reach feature parity with the RDD-based APIs. * And once we reach feature parity, this package will be deprecated. * - * @see [[https://issues.apache.org/jira/browse/SPARK-4591 SPARK-4591]] to track the progress of - * feature parity + * @see <a href="https://issues.apache.org/jira/browse/SPARK-4591">SPARK-4591</a> to track + * the progress of feature parity */ package object mllib diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala index 005119616f0636f4bc56cf180c7a22c712349a39..32e6ecf6308e07fbc2cdc3a2ebc6aa2a32e88b28 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala @@ -48,7 +48,7 @@ class RDDFunctions[T: ClassTag](self: RDD[T]) extends Serializable { } /** - * [[sliding(Int, Int)*]] with step = 1. + * `sliding(Int, Int)*` with step = 1. 
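The `sliding` helper documented here is added to `RDD` via the implicit conversion in `org.apache.spark.mllib.rdd.RDDFunctions`; a quick sketch, assuming a `SparkContext` named `sc`:

```scala
import org.apache.spark.mllib.rdd.RDDFunctions._

val rdd = sc.parallelize(1 to 5, 2)
rdd.sliding(3).collect()
// => Array(Array(1, 2, 3), Array(2, 3, 4), Array(3, 4, 5))
```

Windows are built across partition boundaries, so the result is the same as sliding over the collected sequence.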
*/ def sliding(windowSize: Int): RDD[Array[T]] = sliding(windowSize, 1) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala index cc9ee15738ad6509fe210277de54461fc89fc8bc..d215885797176cccd0fd5ae52c94b6b393978d5c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala @@ -54,11 +54,12 @@ case class Rating @Since("0.8.0") ( * * For implicit preference data, the algorithm used is based on * "Collaborative Filtering for Implicit Feedback Datasets", available at - * [[http://dx.doi.org/10.1109/ICDM.2008.22]], adapted for the blocked approach used here. + * <a href="http://dx.doi.org/10.1109/ICDM.2008.22">here</a>, adapted for the blocked approach + * used here. * * Essentially instead of finding the low-rank approximations to the rating matrix `R`, * this finds the approximations for a preference matrix `P` where the elements of `P` are 1 if - * r > 0 and 0 if r <= 0. The ratings then act as 'confidence' values related to strength of + * r > 0 and 0 if r <= 0. The ratings then act as 'confidence' values related to strength of * indicated user * preferences rather than explicit ratings given to items. */ @@ -280,7 +281,7 @@ class ALS private ( } /** - * Java-friendly version of [[ALS.run]]. + * Java-friendly version of `ALS.run`. */ @Since("1.3.0") def run(ratings: JavaRDD[Rating]): MatrixFactorizationModel = run(ratings.rdd) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala index 24e4dcccc843f59220f400cfe231f22250ce2299..23045fa2b6863e00e348082e0e1d3f9b84bc0e55 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala @@ -146,7 +146,7 @@ class MatrixFactorizationModel @Since("0.8.0") ( } /** - * Java-friendly version of [[MatrixFactorizationModel.predict]]. + * Java-friendly version of `MatrixFactorizationModel.predict`. */ @Since("1.2.0") def predict(usersProducts: JavaPairRDD[JavaInteger, JavaInteger]): JavaRDD[Rating] = { @@ -195,7 +195,7 @@ class MatrixFactorizationModel @Since("0.8.0") ( * - human-readable (JSON) model metadata to path/metadata/ * - Parquet formatted data to path/data/ * - * The model may be loaded using [[Loader.load]]. + * The model may be loaded using `Loader.load`. * * @param sc Spark context used to save model data. * @param path Path specifying the directory in which to save this model. @@ -320,7 +320,7 @@ object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] { /** * Load a model from the given path. * - * The model should have been saved by [[Saveable.save]]. + * The model should have been saved by `Saveable.save`. * * @param sc Spark context used for loading model files. * @param path Path specifying the directory to which the model was saved. 
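The `save`/`load` round trip described above for `MatrixFactorizationModel` looks roughly like this; the ratings, the rank/iterations/lambda values, and the output path are all illustrative, and `sc` is assumed:

```scala
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}

val ratings = sc.parallelize(Seq(
  Rating(1, 10, 4.0), Rating(1, 20, 1.0), Rating(2, 10, 5.0)))

val model = ALS.train(ratings, rank = 10, iterations = 10, lambda = 0.01)

model.save(sc, "target/tmp/alsModel")  // human-readable metadata + Parquet data
val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/alsModel")
```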
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala index 377326f8739b758f9b196079f48269f3ac3063f7..36894d52346afe41cd4228d4a21fd374859186ad 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala @@ -238,23 +238,22 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] { * Sequential PAV implementation based on: * Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani. * "Nearly-isotonic regression." Technometrics 53.1 (2011): 54-61. - * Available from [[http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf]] + * Available from <a href="http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf">here</a> * * Sequential PAV parallelization based on: * Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset. * "An approach to parallelizing isotonic regression." * Applied Mathematics and Parallel Computing. Physica-Verlag HD, 1996. 141-147. - * Available from [[http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf]] + * Available from <a href="http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf">here</a> * - * @see [[http://en.wikipedia.org/wiki/Isotonic_regression Isotonic regression (Wikipedia)]] + * @see <a href="http://en.wikipedia.org/wiki/Isotonic_regression">Isotonic regression + * (Wikipedia)</a> */ @Since("1.3.0") class IsotonicRegression private (private var isotonic: Boolean) extends Serializable { /** * Constructs IsotonicRegression instance with default parameter isotonic = true. - * - * @return New instance of IsotonicRegression. */ @Since("1.3.0") def this() = this(true) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index 7a2a7a35a91cdf9dadecdc47f6375653961007c8..7dc0c459ec03207470a68241fd60621d2a613f4e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -30,12 +30,15 @@ import org.apache.spark.mllib.linalg.{Vector, Vectors} * the corresponding joint dataset. * * A numerically stable algorithm is implemented to compute the mean and variance of instances: - * Reference: [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]] + * Reference: <a href="http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance"> + * variance-wiki</a> * Zero elements (including explicit zero values) are skipped when calling add(), * to have time complexity O(nnz) instead of O(n) for each column. * * For weighted instances, the unbiased estimation of variance is defined by the reliability - * weights: [[https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights]]. + * weights: + * see <a href="https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights"> + * Reliability weights (Wikipedia)</a>. 
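A tiny sketch of the `MultivariateOnlineSummarizer` described above, using two hand-picked vectors so the unbiased column variances are easy to verify:

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

val summarizer = new MultivariateOnlineSummarizer()
summarizer.add(Vectors.dense(1.0, 10.0))
summarizer.add(Vectors.dense(3.0, 30.0))

summarizer.mean      // [2.0, 20.0]
summarizer.variance  // unbiased: [2.0, 200.0]
// Over an RDD[Vector] this is typically driven by treeAggregate, calling add/merge per partition.
```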
*/ @Since("1.1.0") @DeveloperApi diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala index 925fdf4d7e7bc2d1991dc06c158de6464b1ad961..7ba9b292969e7853fbb8e6d19f5608f148a578d1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala @@ -88,7 +88,7 @@ object Statistics { def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y) /** - * Java-friendly version of [[corr()]] + * Java-friendly version of `corr()` */ @Since("1.4.1") def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double]): Double = @@ -112,7 +112,7 @@ object Statistics { def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method) /** - * Java-friendly version of [[corr()]] + * Java-friendly version of `corr()` */ @Since("1.4.1") def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double], method: String): Double = @@ -176,7 +176,7 @@ object Statistics { ChiSqTest.chiSquaredFeatures(data) } - /** Java-friendly version of [[chiSqTest()]] */ + /** Java-friendly version of `chiSqTest()` */ @Since("1.5.0") def chiSqTest(data: JavaRDD[LabeledPoint]): Array[ChiSqTestResult] = chiSqTest(data.rdd) @@ -186,7 +186,8 @@ object Statistics { * distribution of the sample data and the theoretical distribution we can provide a test for the * the null hypothesis that the sample data comes from that theoretical distribution. * For more information on KS Test: - * @see [[https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test]] + * @see <a href="https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test"> + * Kolmogorov-Smirnov test (Wikipedia)</a> * * @param data an `RDD[Double]` containing the sample of data to test * @param cdf a `Double => Double` function to calculate the theoretical CDF at a given value @@ -217,7 +218,7 @@ object Statistics { KolmogorovSmirnovTest.testOneSample(data, distName, params: _*) } - /** Java-friendly version of [[kolmogorovSmirnovTest()]] */ + /** Java-friendly version of `kolmogorovSmirnovTest()` */ @Since("1.5.0") @varargs def kolmogorovSmirnovTest( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala index 39c3644450d6d321a3c7e67069fb9a55e0845391..4cf662e036346c3ab29f3d834dcf924c7b018525 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala @@ -28,7 +28,8 @@ import org.apache.spark.mllib.util.MLUtils * This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In * the event that the covariance matrix is singular, the density will be computed in a * reduced dimensional subspace under which the distribution is supported. 
- * (see [[http://en.wikipedia.org/wiki/Multivariate_normal_distribution#Degenerate_case]]) + * (see <a href="http://en.wikipedia.org/wiki/Multivariate_normal_distribution#Degenerate_case"> + * Degenerate case in Multivariate normal distribution (Wikipedia)</a>) * * @param mu The mean vector of the distribution * @param sigma The covariance matrix of the distribution diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala index ece1e41d986d0ea469138810f2854a6906d566a5..cdeef16135015f3935b6ae37674bcebbdff619ea 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala @@ -29,7 +29,7 @@ import org.apache.spark.rdd.RDD /** * A class that implements - * [[http://en.wikipedia.org/wiki/Gradient_boosting Stochastic Gradient Boosting]] + * <a href="http://en.wikipedia.org/wiki/Gradient_boosting">Stochastic Gradient Boosting</a> * for regression and binary classification. * * The implementation is based upon: diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala index 14f11ce51b878acdd8f670c90b46faafed35d236..428af214060927d2f36f6085da4f66866e8fc9bc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala @@ -36,7 +36,7 @@ import org.apache.spark.util.Utils /** - * A class that implements a [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] + * A class that implements a <a href="http://en.wikipedia.org/wiki/Random_forest">Random Forest</a> * learning algorithm for classification and regression. * It supports both continuous and categorical features. * @@ -46,9 +46,9 @@ import org.apache.spark.util.Utils * - The defaults of sqrt (classification) and onethird (regression) match the R randomForest * package. * - * @see [[http://www.stat.berkeley.edu/~breiman/randomforest2001.pdf Breiman (2001)]] - * @see [[http://www.stat.berkeley.edu/~breiman/Using_random_forests_V3.1.pdf Breiman manual for - * random forests]] + * @see <a href="http://www.stat.berkeley.edu/~breiman/randomforest2001.pdf">Breiman (2001)</a> + * @see <a href="http://www.stat.berkeley.edu/~breiman/Using_random_forests_V3.1.pdf"> + * Breiman manual for random forests</a> * @param strategy The configuration parameters for the random forest algorithm which specify * the type of random forest (classification or regression), feature type * (continuous, categorical), depth of the tree, quantile calculation strategy, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala index 5cef9d0631b591d1b901b80a164b4b72e73beb10..be2704df3444f38b75113b485a41907e8a56bdbb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala @@ -25,7 +25,7 @@ import org.apache.spark.mllib.tree.configuration.FeatureType.FeatureType * Split applied to a feature * @param feature feature index * @param threshold Threshold for continuous feature. - * Split left if feature <= threshold, else right. + * Split left if feature <= threshold, else right. 
* @param featureType type of feature -- categorical or continuous * @param categories Split left if categorical feature value is in this set, else right. */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index e96c2bc6edfc361341af19f3399128fabcea9514..6bb3271aacb44d2476fba2c8b28a8b5085475ff4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -213,7 +213,7 @@ object MLUtils extends Logging { } /** - * Version of [[kFold()]] taking a Long seed. + * Version of `kFold()` taking a Long seed. */ @Since("2.0.0") def kFold[T: ClassTag](rdd: RDD[T], numFolds: Int, seed: Long): Array[(RDD[T], RDD[T])] = { @@ -262,7 +262,7 @@ object MLUtils extends Logging { * @param dataset input dataset * @param cols a list of vector columns to be converted. New vector columns will be ignored. If * unspecified, all old vector columns will be converted except nested ones. - * @return the input [[DataFrame]] with old vector columns converted to the new vector type + * @return the input `DataFrame` with old vector columns converted to the new vector type */ @Since("2.0.0") @varargs @@ -314,7 +314,7 @@ object MLUtils extends Logging { * @param dataset input dataset * @param cols a list of vector columns to be converted. Old vector columns will be ignored. If * unspecified, all new vector columns will be converted except nested ones. - * @return the input [[DataFrame]] with new vector columns converted to the old vector type + * @return the input `DataFrame` with new vector columns converted to the old vector type */ @Since("2.0.0") @varargs @@ -366,7 +366,7 @@ object MLUtils extends Logging { * @param dataset input dataset * @param cols a list of matrix columns to be converted. New matrix columns will be ignored. If * unspecified, all old matrix columns will be converted except nested ones. - * @return the input [[DataFrame]] with old matrix columns converted to the new matrix type + * @return the input `DataFrame` with old matrix columns converted to the new matrix type */ @Since("2.0.0") @varargs @@ -416,7 +416,7 @@ object MLUtils extends Logging { * @param dataset input dataset * @param cols a list of matrix columns to be converted. Old matrix columns will be ignored. If * unspecified, all new matrix columns will be converted except nested ones. - * @return the input [[DataFrame]] with new matrix columns converted to the old matrix type + * @return the input `DataFrame` with new matrix columns converted to the old matrix type */ @Since("2.0.0") @varargs diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala index c881c8ea50c09ef8ae0a5e2324ec158f63d7bc5e..da0eb04764c5792182d6f327591a2145b5093989 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala @@ -72,7 +72,7 @@ trait Loader[M <: Saveable] { /** * Load a model from the given path. * - * The model should have been saved by [[Saveable.save]]. + * The model should have been saved by `Saveable.save`. * * @param sc Spark context used for loading model files. * @param path Path specifying the directory to which the model was saved. 
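The `kFold` overload above (the Long-seed variant) splits an RDD into complementary training/validation pairs; a sketch assuming some existing `data: RDD[T]`:

```scala
import org.apache.spark.mllib.util.MLUtils

// `data` is an assumed RDD[T] of training examples.
val folds = MLUtils.kFold(data, numFolds = 3, seed = 42L)
folds.foreach { case (training, validation) =>
  // fit on `training`, evaluate on `validation`
}
```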
diff --git a/pom.xml b/pom.xml index 7c0b0b59dc62be5a1c3848a77028bc9e7d7305c2..5c417d2b357271c2849332d0227bad44547d3979 100644 --- a/pom.xml +++ b/pom.xml @@ -2495,6 +2495,18 @@ <name>tparam</name> <placement>X</placement> </tag> + <tag> + <name>constructor</name> + <placement>X</placement> + </tag> + <tag> + <name>todo</name> + <placement>X</placement> + </tag> + <tag> + <name>groupname</name> + <placement>X</placement> + </tag> </tags> </configuration> </plugin> diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 429a163d22a6d6c99586ff1f54bc05ad93a91648..e3fbe0379fb7b6c7635d41c3e3420682d8150a8d 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -745,7 +745,10 @@ object Unidoc { "-tag", """example:a:Example\:""", "-tag", """note:a:Note\:""", "-tag", "group:X", - "-tag", "tparam:X" + "-tag", "tparam:X", + "-tag", "constructor:X", + "-tag", "todo:X", + "-tag", "groupname:X" ), // Use GitHub repository for Scaladoc source links diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index 65f91429648c14ba9da372e669e625251ffa1126..a821d2ca345793626791332aacc414b6ae0b690d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -343,7 +343,7 @@ trait Row extends Serializable { } /** - * Returns a Map(name -> value) for the requested fieldNames + * Returns a Map(name -> value) for the requested fieldNames * For primitive types if value is null it returns 'zero value' specific for primitive * ie. 0 for Int - use isNullAt to ensure that value is not null * diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala index 302054708ccb5cd694f7a3ad2979193ff1d6788f..1a93f4590331b621dc93bf8e957cea1823c4e8b0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala @@ -37,8 +37,8 @@ import org.apache.spark.sql.types._ * - Xiangrui Meng. "Simpler Online Updates for Arbitrary-Order Central Moments." * 2015. http://arxiv.org/abs/1510.04923 * - * @see [[https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - * Algorithms for calculating variance (Wikipedia)]] + * @see <a href="https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance"> + * Algorithms for calculating variance (Wikipedia)</a> * * @param child to compute central moments of. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/BinaryType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/BinaryType.scala index a4a358a242c700262314031007fb89ac7291bb1d..02c8318b4d413295ba4e4e77e0a6dd263dce09fb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/BinaryType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/BinaryType.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.util.TypeUtils /** * The data type representing `Array[Byte]` values. - * Please use the singleton [[DataTypes.BinaryType]]. + * Please use the singleton `DataTypes.BinaryType`. 
*/ @InterfaceStability.Stable class BinaryType private() extends AtomicType { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/BooleanType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/BooleanType.scala index 059f89f9cda3222c5338c95376c30405f844ffae..cee78f4b4ac1a0a6ac8a9bd274ead2cd68fbe52a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/BooleanType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/BooleanType.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.ScalaReflectionLock /** - * The data type representing `Boolean` values. Please use the singleton [[DataTypes.BooleanType]]. + * The data type representing `Boolean` values. Please use the singleton `DataTypes.BooleanType`. * * @since 1.3.0 */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ByteType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ByteType.scala index bc6251f024e58820a7927610633b9a5240ac7c59..b1dd5eda36bd6670dd8b0629f2c37d999a28abe1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ByteType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ByteType.scala @@ -24,7 +24,7 @@ import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.catalyst.ScalaReflectionLock /** - * The data type representing `Byte` values. Please use the singleton [[DataTypes.ByteType]]. + * The data type representing `Byte` values. Please use the singleton `DataTypes.ByteType`. * * @since 1.3.0 */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala index 21f3497ba06fba5164ad7dd335e78b7dbb9e73df..2342036a57460bf3bbd86c061601c2f4e0143ce5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala @@ -23,7 +23,7 @@ import org.apache.spark.annotation.InterfaceStability * The data type representing calendar time intervals. The calendar time interval is stored * internally in two components: number of months the number of microseconds. * - * Please use the singleton [[DataTypes.CalendarIntervalType]]. + * Please use the singleton `DataTypes.CalendarIntervalType`. * * @note Calendar intervals are not comparable. * diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DateType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DateType.scala index 8d0ecc051f4ce59b393c19c40071696eaeef907b..0c0574b845536487938f78ecb6d52ca12ae7e5ab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DateType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DateType.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.ScalaReflectionLock /** * A date type, supporting "0001-01-01" through "9999-12-31". * - * Please use the singleton [[DataTypes.DateType]]. + * Please use the singleton `DataTypes.DateType`. * * Internally, this is represented as the number of days from 1970-01-01. 
* diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala index d7ca0cbeedcd3f06b9abb296122afc7066d32aea..cecad3b7b4c0a104a7e204e1efeadfd48124e277 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.expressions.Expression * * The default precision and scale is (10, 0). * - * Please use [[DataTypes.createDecimalType()]] to create a specific instance. + * Please use `DataTypes.createDecimalType()` to create a specific instance. * * @since 1.3.0 */ @@ -92,7 +92,7 @@ case class DecimalType(precision: Int, scale: Int) extends FractionalType { } /** - * The default size of a value of the DecimalType is 8 bytes (precision <= 18) or 16 bytes. + * The default size of a value of the DecimalType is 8 bytes (precision <= 18) or 16 bytes. */ override def defaultSize: Int = if (precision <= Decimal.MAX_LONG_DIGITS) 8 else 16 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DoubleType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DoubleType.scala index c21ac0e43eee0c7de4eed164ab69ccfc003c8488..400f7aed6ae72a21740241eecd00aac218d956bf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DoubleType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DoubleType.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.ScalaReflectionLock import org.apache.spark.util.Utils /** - * The data type representing `Double` values. Please use the singleton [[DataTypes.DoubleType]]. + * The data type representing `Double` values. Please use the singleton `DataTypes.DoubleType`. * * @since 1.3.0 */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/FloatType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/FloatType.scala index c5bf8883bad9309c0e55d15e1a210cac08f0fd0f..b9812b236d575973f4ac829d125b53625b61c9af 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/FloatType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/FloatType.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.ScalaReflectionLock import org.apache.spark.util.Utils /** - * The data type representing `Float` values. Please use the singleton [[DataTypes.FloatType]]. + * The data type representing `Float` values. Please use the singleton `DataTypes.FloatType`. * * @since 1.3.0 */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/IntegerType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/IntegerType.scala index 724e59c0bcbf47b6cc83235206cb66c9e4ab27fb..dca612ecbfed9fcc6422cadd45a50484ddb287aa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/IntegerType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/IntegerType.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.ScalaReflectionLock /** - * The data type representing `Int` values. Please use the singleton [[DataTypes.IntegerType]]. + * The data type representing `Int` values. Please use the singleton `DataTypes.IntegerType`. 
* * @since 1.3.0 */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/LongType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/LongType.scala index 42285a9d0aa2960b14a51357d1632e1ee86a241d..396c3355701c5068909f1ef20ea7c29b708f7faf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/LongType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/LongType.scala @@ -24,7 +24,7 @@ import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.catalyst.ScalaReflectionLock /** - * The data type representing `Long` values. Please use the singleton [[DataTypes.LongType]]. + * The data type representing `Long` values. Please use the singleton `DataTypes.LongType`. * * @since 1.3.0 */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala index 3a32aa43d1c3a78af45d596ee35ee3a068768565..fbf3a617862512b56905d14cb6be520a67d3b12e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala @@ -25,7 +25,7 @@ import org.apache.spark.annotation.InterfaceStability /** * The data type for Maps. Keys in a map are not allowed to have `null` values. * - * Please use [[DataTypes.createMapType()]] to create a specific instance. + * Please use `DataTypes.createMapType()` to create a specific instance. * * @param keyType The data type of map keys. * @param valueType The data type of map values. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/NullType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/NullType.scala index bdf9a819d007b899dc539036e5a85f796ab3dd10..494225b47a270affd99286b051a853e61d1c6f63 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/NullType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/NullType.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.InterfaceStability /** - * The data type representing `NULL` values. Please use the singleton [[DataTypes.NullType]]. + * The data type representing `NULL` values. Please use the singleton `DataTypes.NullType`. * * @since 1.3.0 */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ShortType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ShortType.scala index 3fee299d578ccf3ef25c4889c6c19e562fca73ca..1410d5ba0e0b0170df1224dc5098e631072c72b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ShortType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ShortType.scala @@ -24,7 +24,7 @@ import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.catalyst.ScalaReflectionLock /** - * The data type representing `Short` values. Please use the singleton [[DataTypes.ShortType]]. + * The data type representing `Short` values. Please use the singleton `DataTypes.ShortType`. 
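Creating a `MapType` as recommended above, either through `DataTypes.createMapType()` or the case-class constructor:

```scala
import org.apache.spark.sql.types.{DataTypes, IntegerType, MapType, StringType}

val m1 = DataTypes.createMapType(StringType, IntegerType)            // valueContainsNull = true
val m2 = MapType(StringType, IntegerType, valueContainsNull = false) // keys are never null
```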
* * @since 1.3.0 */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala index 5d5a6f52a305bd3ddffbce92b2a96e347276a95c..d1c0da3479d765a36098443ca762fac63efedcb4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.ScalaReflectionLock import org.apache.spark.unsafe.types.UTF8String /** - * The data type representing `String` values. Please use the singleton [[DataTypes.StringType]]. + * The data type representing `String` values. Please use the singleton `DataTypes.StringType`. * * @since 1.3.0 */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/TimestampType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/TimestampType.scala index 4540d8358acaddbb1b4928d0adc46ef9c5ff3fca..2875995420053135272c488a88d4ff82e1aca520 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/TimestampType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/TimestampType.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.ScalaReflectionLock /** * The data type representing `java.sql.Timestamp` values. - * Please use the singleton [[DataTypes.TimestampType]]. + * Please use the singleton `DataTypes.TimestampType`. * * @since 1.3.0 */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index a77937efd7e151e1ddc908d3eb9683c0418fc9bf..5be9a99369997ed867fa8a6af0c133f66cd5431c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -239,8 +239,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads a JSON file ([[http://jsonlines.org/ JSON Lines text format or newline-delimited JSON]]) - * and returns the result as a [[DataFrame]]. + * Loads a JSON file (<a href="http://jsonlines.org/">JSON Lines text format or + * newline-delimited JSON</a>) and returns the result as a [[DataFrame]]. * See the documentation on the overloaded `json()` method with varargs for more details. * * @since 1.4.0 @@ -251,8 +251,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads a JSON file ([[http://jsonlines.org/ JSON Lines text format or newline-delimited JSON]]) - * and returns the result as a [[DataFrame]]. + * Loads a JSON file (<a href="http://jsonlines.org/">JSON Lines text format or + * newline-delimited JSON</a>) and returns the result as a [[DataFrame]]. * * This function goes through the input once to determine the input schema. If you know the * schema in advance, use the version that specifies the schema to avoid the extra scan. @@ -297,8 +297,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { def json(paths: String*): DataFrame = format("json").load(paths : _*) /** - * Loads a `JavaRDD[String]` storing JSON objects ([[http://jsonlines.org/ JSON Lines text format - * or newline-delimited JSON]]) and returns the result as a [[DataFrame]]. + * Loads a `JavaRDD[String]` storing JSON objects (<a href="http://jsonlines.org/">JSON + * Lines text format or newline-delimited JSON</a>) and returns the result as + * a [[DataFrame]]. 
* * Unless the schema is specified using [[schema]] function, this function goes through the * input once to determine the input schema. @@ -309,8 +310,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { def json(jsonRDD: JavaRDD[String]): DataFrame = json(jsonRDD.rdd) /** - * Loads an `RDD[String]` storing JSON objects ([[http://jsonlines.org/ JSON Lines text format or - * newline-delimited JSON]]) and returns the result as a [[DataFrame]]. + * Loads an `RDD[String]` storing JSON objects (<a href="http://jsonlines.org/">JSON Lines + * text format or newline-delimited JSON</a>) and returns the result as a [[DataFrame]]. * * Unless the schema is specified using [[schema]] function, this function goes through the * input once to determine the input schema. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 6335fc4579a284ec02ffb7edaa129fdd97e7661e..a9a861c4635b2c30b396a9fe23e4bd9b316e6923 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -48,8 +48,8 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * * This method implements a variation of the Greenwald-Khanna algorithm (with some speed * optimizations). - * The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670 Space-efficient - * Online Computation of Quantile Summaries]] by Greenwald and Khanna. + * The algorithm was first present in <a href="http://dx.doi.org/10.1145/375663.375670"> + * Space-efficient Online Computation of Quantile Summaries</a> by Greenwald and Khanna. * * @param col the name of the numerical column * @param probabilities a list of quantile probabilities @@ -184,7 +184,8 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * Finding frequent items for columns, possibly with false positives. Using the * frequent element count algorithm described in - * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]]. + * <a href="http://dx.doi.org/10.1145/762471.762473">here</a>, proposed by Karp, + * Schenker, and Papadimitriou. * The `support` should be greater than 1e-4. * * This function is meant for exploratory data analysis, as we make no guarantee about the @@ -230,7 +231,8 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * Finding frequent items for columns, possibly with false positives. Using the * frequent element count algorithm described in - * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]]. + * <a href="http://dx.doi.org/10.1145/762471.762473">here</a>, proposed by Karp, + * Schenker, and Papadimitriou. * Uses a `default` support of 1%. * * This function is meant for exploratory data analysis, as we make no guarantee about the @@ -248,7 +250,8 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * (Scala-specific) Finding frequent items for columns, possibly with false positives. Using the * frequent element count algorithm described in - * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]]. + * <a href="http://dx.doi.org/10.1145/762471.762473">here</a>, proposed by Karp, Schenker, + * and Papadimitriou. 
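A short sketch of the two `DataFrameStatFunctions` helpers documented above, `approxQuantile` (Greenwald-Khanna) and `freqItems` (Karp et al.); `df` is an assumed `DataFrame` with a numeric column "value", and the probabilities and support values are illustrative:

```scala
// `df` is an assumed DataFrame with a numeric column "value".
val quartiles = df.stat.approxQuantile("value", Array(0.25, 0.5, 0.75), 0.01)
val frequent  = df.stat.freqItems(Seq("value"), 0.4) // items with support greater than 0.4
```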
* * This function is meant for exploratory data analysis, as we make no guarantee about the * backward compatibility of the schema of the resulting [[DataFrame]]. @@ -291,7 +294,8 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * (Scala-specific) Finding frequent items for columns, possibly with false positives. Using the * frequent element count algorithm described in - * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]]. + * <a href="http://dx.doi.org/10.1145/762471.762473">here</a>, proposed by Karp, Schenker, + * and Papadimitriou. * Uses a `default` support of 1%. * * This function is meant for exploratory data analysis, as we make no guarantee about the diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 15281f24fa6285072a0cb5948e0304021478f952..2d863422fbabe0d8c33b422323c3fe54a5f3908a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -442,8 +442,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } /** - * Saves the content of the [[DataFrame]] in JSON format ([[http://jsonlines.org/ JSON Lines text - * format or newline-delimited JSON]]) at the specified path. + * Saves the content of the [[DataFrame]] in JSON format (<a href="http://jsonlines.org/"> + * JSON Lines text format or newline-delimited JSON</a>) at the specified path. * This is equivalent to: * {{{ * format("json").save(path) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 2fae93651b344be6315a0a3bd32946d605b12f8a..858fa4c7609b6c29e2bdbed485d48dfd7ea242d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -172,7 +172,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) def experimental: ExperimentalMethods = sparkSession.experimental /** - * Returns a [[DataFrame]] with no rows or columns. + * Returns a `DataFrame` with no rows or columns. * * @group basic * @since 1.3.0 @@ -254,7 +254,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * :: Experimental :: * (Scala-specific) Implicit methods available in Scala for converting - * common Scala objects into [[DataFrame]]s. + * common Scala objects into `DataFrame`s. * * {{{ * val sqlContext = new SQLContext(sc) @@ -298,7 +298,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Convert a [[BaseRelation]] created for external data sources into a [[DataFrame]]. + * Convert a [[BaseRelation]] created for external data sources into a `DataFrame`. * * @group dataframes * @since 1.3.0 @@ -309,7 +309,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * :: DeveloperApi :: - * Creates a [[DataFrame]] from an [[RDD]] containing [[Row]]s using the given schema. + * Creates a `DataFrame` from an [[RDD]] containing [[Row]]s using the given schema. * It is important to make sure that the structure of every [[Row]] of the provided RDD matches * the provided schema. Otherwise, there will be runtime exception. * Example: @@ -438,7 +438,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * :: DeveloperApi :: - * Creates a [[DataFrame]] from a [[JavaRDD]] containing [[Row]]s using the given schema. 
+ * Creates a `DataFrame` from a [[JavaRDD]] containing [[Row]]s using the given schema. * It is important to make sure that the structure of every [[Row]] of the provided RDD matches * the provided schema. Otherwise, there will be runtime exception. * @@ -453,7 +453,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * :: DeveloperApi :: - * Creates a [[DataFrame]] from a [[java.util.List]] containing [[Row]]s using the given schema. + * Creates a `DataFrame` from a [[java.util.List]] containing [[Row]]s using the given schema. * It is important to make sure that the structure of every [[Row]] of the provided List matches * the provided schema. Otherwise, there will be runtime exception. * @@ -504,7 +504,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * Returns a [[DataFrameReader]] that can be used to read non-streaming data in as a - * [[DataFrame]]. + * `DataFrame`. * {{{ * sqlContext.read.parquet("/path/to/file.parquet") * sqlContext.read.schema(schema).json("/path/to/file.json") @@ -518,7 +518,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * :: Experimental :: - * Returns a [[DataStreamReader]] that can be used to read streaming data in as a [[DataFrame]]. + * Returns a [[DataStreamReader]] that can be used to read streaming data in as a `DataFrame`. * {{{ * sparkSession.readStream.parquet("/path/to/directory/of/parquet/files") * sparkSession.readStream.schema(schema).json("/path/to/directory/of/json/files") @@ -617,7 +617,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Registers the given [[DataFrame]] as a temporary table in the catalog. Temporary tables exist + * Registers the given `DataFrame` as a temporary table in the catalog. Temporary tables exist * only during the lifetime of this instance of SQLContext. */ private[sql] def registerDataFrameAsTable(df: DataFrame, tableName: String): Unit = { @@ -638,7 +638,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * :: Experimental :: - * Creates a [[DataFrame]] with a single [[LongType]] column named `id`, containing elements + * Creates a `DataFrame` with a single [[LongType]] column named `id`, containing elements * in a range from 0 to `end` (exclusive) with step value 1. * * @since 1.4.1 @@ -650,7 +650,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * :: Experimental :: - * Creates a [[DataFrame]] with a single [[LongType]] column named `id`, containing elements + * Creates a `DataFrame` with a single [[LongType]] column named `id`, containing elements * in a range from `start` to `end` (exclusive) with step value 1. * * @since 1.4.0 @@ -662,7 +662,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * :: Experimental :: - * Creates a [[DataFrame]] with a single [[LongType]] column named `id`, containing elements + * Creates a `DataFrame` with a single [[LongType]] column named `id`, containing elements * in a range from `start` to `end` (exclusive) with a step value. * * @since 2.0.0 @@ -676,7 +676,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * :: Experimental :: - * Creates a [[DataFrame]] with a single [[LongType]] column named `id`, containing elements + * Creates a `DataFrame` with a single [[LongType]] column named `id`, containing elements * in an range from `start` to `end` (exclusive) with an step value, with partition number * specified. 
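A short sketch of the `range` variants documented above; as the Scaladoc notes, each returns a `DataFrame` with a single `LongType` column named `id` (the step and partition count below are arbitrary):

{{{
import org.apache.spark.sql.SQLContext

def rangeSketch(sqlContext: SQLContext): Unit = {
  // 0, 2, 4, 6, 8 in a single "id" column, spread over 2 partitions
  val ids = sqlContext.range(0L, 10L, 2L, 2)
  ids.show()
  // the same data is available through the non-deprecated SparkSession API:
  // sqlContext.sparkSession.range(0L, 10L, 2L, 2)
}
}}}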
* @@ -690,7 +690,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Executes a SQL query using Spark, returning the result as a [[DataFrame]]. The dialect that is + * Executes a SQL query using Spark, returning the result as a `DataFrame`. The dialect that is * used for SQL parsing can be configured with 'spark.sql.dialect'. * * @group basic @@ -699,7 +699,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) def sql(sqlText: String): DataFrame = sparkSession.sql(sqlText) /** - * Returns the specified table as a [[DataFrame]]. + * Returns the specified table as a `DataFrame`. * * @group ddl_ops * @since 1.3.0 @@ -709,7 +709,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Returns a [[DataFrame]] containing names of existing tables in the current database. + * Returns a `DataFrame` containing names of existing tables in the current database. * The returned DataFrame has two columns, tableName and isTemporary (a Boolean * indicating if a table is a temporary one or not). * @@ -721,7 +721,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Returns a [[DataFrame]] containing names of existing tables in the given database. + * Returns a `DataFrame` containing names of existing tables in the given database. * The returned DataFrame has two columns, tableName and isTemporary (a Boolean * indicating if a table is a temporary one or not). * @@ -799,8 +799,8 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty - * [[DataFrame]] if no paths are passed in. + * Loads a Parquet file, returning the result as a `DataFrame`. This function returns an empty + * `DataFrame` if no paths are passed in. * * @group specificdata * @deprecated As of 1.4.0, replaced by `read().parquet()`. @@ -816,7 +816,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Loads a JSON file (one object per line), returning the result as a [[DataFrame]]. + * Loads a JSON file (one object per line), returning the result as a `DataFrame`. * It goes through the entire dataset once to determine the schema. * * @group specificdata @@ -829,7 +829,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * Loads a JSON file (one object per line) and applies the given schema, - * returning the result as a [[DataFrame]]. + * returning the result as a `DataFrame`. * * @group specificdata * @deprecated As of 1.4.0, replaced by `read().json()`. @@ -850,7 +850,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a - * [[DataFrame]]. + * `DataFrame`. * It goes through the entire dataset once to determine the schema. * * @group specificdata @@ -861,7 +861,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a - * [[DataFrame]]. + * `DataFrame`. * It goes through the entire dataset once to determine the schema. * * @group specificdata @@ -872,7 +872,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * Loads an RDD[String] storing JSON objects (one object per record) and applies the given schema, - * returning the result as a [[DataFrame]]. + * returning the result as a `DataFrame`. * * @group specificdata * @deprecated As of 1.4.0, replaced by `read().json()`. 
@@ -884,7 +884,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * Loads an JavaRDD<String> storing JSON objects (one object per record) and applies the given - * schema, returning the result as a [[DataFrame]]. + * schema, returning the result as a `DataFrame`. * * @group specificdata * @deprecated As of 1.4.0, replaced by `read().json()`. @@ -896,7 +896,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * Loads an RDD[String] storing JSON objects (one object per record) inferring the - * schema, returning the result as a [[DataFrame]]. + * schema, returning the result as a `DataFrame`. * * @group specificdata * @deprecated As of 1.4.0, replaced by `read().json()`. @@ -908,7 +908,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * Loads a JavaRDD[String] storing JSON objects (one object per record) inferring the - * schema, returning the result as a [[DataFrame]]. + * schema, returning the result as a `DataFrame`. * * @group specificdata * @deprecated As of 1.4.0, replaced by `read().json()`. @@ -995,7 +995,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Construct a [[DataFrame]] representing the database table accessible via JDBC URL + * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table. * * @group specificdata @@ -1007,7 +1007,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Construct a [[DataFrame]] representing the database table accessible via JDBC URL + * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table. Partitions of the table will be retrieved in parallel based on the parameters * passed to this function. * @@ -1031,10 +1031,10 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Construct a [[DataFrame]] representing the database table accessible via JDBC URL + * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table. The theParts parameter gives a list expressions * suitable for inclusion in WHERE clauses; each one defines one partition - * of the [[DataFrame]]. + * of the `DataFrame`. * * @group specificdata * @deprecated As of 1.4.0, replaced by `read().jdbc()`. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala index b9dbfcf7734c3c0f7fdb2e822a386e8760b994fe..cdb755edc79a138f788586c41a45e2291b57d2a4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala @@ -69,7 +69,8 @@ object FrequentItems extends Logging { /** * Finding frequent items for columns, possibly with false positives. Using the * frequent element count algorithm described in - * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]]. + * <a href="http://dx.doi.org/10.1145/762471.762473">here</a>, proposed by Karp, Schenker, + * and Papadimitriou. * The `support` should be greater than 1e-4. * For Internal use only. 
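The deprecated `SQLContext.jdbc` overloads above map onto `read.jdbc`; a hedged sketch of the partitioned variant (the JDBC URL, table, credentials, and bounds are made up):

{{{
import java.util.Properties
import org.apache.spark.sql.SQLContext

def jdbcSketch(sqlContext: SQLContext): Unit = {
  val props = new Properties()
  props.setProperty("user", "reporting")      // hypothetical credentials
  props.setProperty("password", "secret")

  // partitions of the table are read in parallel by splitting the "id" column
  // into 8 ranges between the given lower and upper bounds
  val orders = sqlContext.read.jdbc(
    "jdbc:postgresql://dbhost:5432/shop",     // hypothetical JDBC URL
    "orders",
    "id", 0L, 1000000L, 8,
    props)
  orders.printSchema()
}
}}}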
* diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index c02b15498748f07cc159d5bdd7a7e6d4aa9aa1e3..2b2e706125ede62a3074faf6e6e4457489d69fa8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -41,8 +41,8 @@ object StatFunctions extends Logging { * * This method implements a variation of the Greenwald-Khanna algorithm (with some speed * optimizations). - * The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670 Space-efficient - * Online Computation of Quantile Summaries]] by Greenwald and Khanna. + * The algorithm was first present in <a href="http://dx.doi.org/10.1145/375663.375670"> + * Space-efficient Online Computation of Quantile Summaries</a> by Greenwald and Khanna. * * @param df the dataframe * @param cols numerical columns of the dataframe diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala index eea98414003ba70ef1a1fcbfc2f79abad6503ff4..058c38c8cb8f41c29c1670a30a095c021fe67ded 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression /** * :: Experimental :: - * A base class for user-defined aggregations, which can be used in [[Dataset]] operations to take + * A base class for user-defined aggregations, which can be used in `Dataset` operations to take * all of the elements of a group and reduce them to a single value. * * For example, the following aggregator extracts an `int` from a specific class and adds them up: @@ -80,19 +80,19 @@ abstract class Aggregator[-IN, BUF, OUT] extends Serializable { def finish(reduction: BUF): OUT /** - * Specifies the [[Encoder]] for the intermediate value type. + * Specifies the `Encoder` for the intermediate value type. * @since 2.0.0 */ def bufferEncoder: Encoder[BUF] /** - * Specifies the [[Encoder]] for the final ouput value type. + * Specifies the `Encoder` for the final output value type. * @since 2.0.0 */ def outputEncoder: Encoder[OUT] /** - * Returns this `Aggregator` as a [[TypedColumn]] that can be used in [[Dataset]]. + * Returns this `Aggregator` as a `TypedColumn` that can be used in `Dataset` * operations. * @since 1.6.0 */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index 36dd5f78ac1377fa7aec4e7f6b371744762843af..b13fe7016092c5aa5192e7c18193ab7bdec333fd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.functions import org.apache.spark.sql.types.DataType /** - * A user-defined function. To create one, use the `udf` functions in [[functions]]. + * A user-defined function. To create one, use the `udf` functions in `functions`.
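To make the `Aggregator` contract above concrete, a minimal sum aggregator; the `Purchase` case class and object name are invented for the example:

{{{
import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator

case class Purchase(amount: Int)

object SumAmount extends Aggregator[Purchase, Long, Long] {
  def zero: Long = 0L                                   // identity for the reduction
  def reduce(b: Long, p: Purchase): Long = b + p.amount // fold one element into the buffer
  def merge(b1: Long, b2: Long): Long = b1 + b2         // combine two partial buffers
  def finish(reduction: Long): Long = reduction         // final output value
  def bufferEncoder: Encoder[Long] = Encoders.scalaLong // Encoder for the intermediate type
  def outputEncoder: Encoder[Long] = Encoders.scalaLong // Encoder for the output type
}

// usage: ds.select(SumAmount.toColumn), where ds: Dataset[Purchase]
}}}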
* * As an example: * {{{ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala index 327bc379d4132e0a6c1cf1bf185c13d7486856df..f3cf3052ea3ea6a901500c6cc3f0f973190e483f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala @@ -117,8 +117,8 @@ object Window { * "current row", while "-1" means the row before the current row, and "5" means the fifth row * after the current row. * - * We recommend users use [[Window.unboundedPreceding]], [[Window.unboundedFollowing]], - * and [[Window.currentRow]] to specify special boundary values, rather than using integral + * We recommend users use `Window.unboundedPreceding`, `Window.unboundedFollowing`, + * and `Window.currentRow` to specify special boundary values, rather than using integral * values directly. * * A row based boundary is based on the position of the row within the partition. @@ -148,9 +148,9 @@ object Window { * }}} * * @param start boundary start, inclusive. The frame is unbounded if this is - * the minimum long value ([[Window.unboundedPreceding]]). + * the minimum long value (`Window.unboundedPreceding`). * @param end boundary end, inclusive. The frame is unbounded if this is the - * maximum long value ([[Window.unboundedFollowing]]). + * maximum long value (`Window.unboundedFollowing`). * @since 2.1.0 */ // Note: when updating the doc for this method, also update WindowSpec.rowsBetween. @@ -166,8 +166,8 @@ object Window { * while "-1" means one off before the current row, and "5" means the five off after the * current row. * - * We recommend users use [[Window.unboundedPreceding]], [[Window.unboundedFollowing]], - * and [[Window.currentRow]] to specify special boundary values, rather than using integral + * We recommend users use `Window.unboundedPreceding`, `Window.unboundedFollowing`, + * and `Window.currentRow` to specify special boundary values, rather than using integral * values directly. * * A range based boundary is based on the actual value of the ORDER BY @@ -200,9 +200,9 @@ object Window { * }}} * * @param start boundary start, inclusive. The frame is unbounded if this is - * the minimum long value ([[Window.unboundedPreceding]]). + * the minimum long value (`Window.unboundedPreceding`). * @param end boundary end, inclusive. The frame is unbounded if this is the - * maximum long value ([[Window.unboundedFollowing]]). + * maximum long value (`Window.unboundedFollowing`). * @since 2.1.0 */ // Note: when updating the doc for this method, also update WindowSpec.rangeBetween. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala index 4a8ce695bd4da5028cb23c37d469f2455b30be00..de7d7a1772753d85715b22148de81f529f4ea7f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala @@ -85,8 +85,8 @@ class WindowSpec private[sql]( * "current row", while "-1" means the row before the current row, and "5" means the fifth row * after the current row. 
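A running-total sketch of the row-frame API described above, using the recommended boundary constants rather than raw integral values (the column names are assumptions):

{{{
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.sum

def runningTotal(df: DataFrame): DataFrame = {
  // all rows from the start of the partition up to and including the current row
  val w = Window.partitionBy("category").orderBy("ts")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
  df.withColumn("running_total", sum("amount").over(w))
}
}}}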
* - * We recommend users use [[Window.unboundedPreceding]], [[Window.unboundedFollowing]], - * and [[Window.currentRow]] to specify special boundary values, rather than using integral + * We recommend users use `Window.unboundedPreceding`, `Window.unboundedFollowing`, + * and `Window.currentRow` to specify special boundary values, rather than using integral * values directly. * * A row based boundary is based on the position of the row within the partition. @@ -116,9 +116,9 @@ class WindowSpec private[sql]( * }}} * * @param start boundary start, inclusive. The frame is unbounded if this is - * the minimum long value ([[Window.unboundedPreceding]]). + * the minimum long value (`Window.unboundedPreceding`). * @param end boundary end, inclusive. The frame is unbounded if this is the - * maximum long value ([[Window.unboundedFollowing]]). + * maximum long value (`Window.unboundedFollowing`). * @since 1.4.0 */ // Note: when updating the doc for this method, also update Window.rowsBetween. @@ -133,8 +133,8 @@ class WindowSpec private[sql]( * while "-1" means one off before the current row, and "5" means the five off after the * current row. * - * We recommend users use [[Window.unboundedPreceding]], [[Window.unboundedFollowing]], - * and [[Window.currentRow]] to specify special boundary values, rather than using integral + * We recommend users use `Window.unboundedPreceding`, `Window.unboundedFollowing`, + * and `Window.currentRow` to specify special boundary values, rather than using integral * values directly. * * A range based boundary is based on the actual value of the ORDER BY @@ -167,9 +167,9 @@ class WindowSpec private[sql]( * }}} * * @param start boundary start, inclusive. The frame is unbounded if this is - * the minimum long value ([[Window.unboundedPreceding]]). + * the minimum long value (`Window.unboundedPreceding`). * @param end boundary end, inclusive. The frame is unbounded if this is the - * maximum long value ([[Window.unboundedFollowing]]). + * maximum long value (`Window.unboundedFollowing`). * @since 1.4.0 */ // Note: when updating the doc for this method, also update Window.rangeBetween. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala index aa71cb9e3bc857dca5cab326382cf2334fea91c0..650ffd458659208bf2a037e04c78a390122b8bb6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.execution.aggregate._ /** * :: Experimental :: - * Type-safe functions available for [[Dataset]] operations in Scala. + * Type-safe functions available for `Dataset` operations in Scala. * * Java users should use [[org.apache.spark.sql.expressions.javalang.typed]]. * diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala index bc9788d81fe6afbc01ef8b2e0742b622327dc4b2..4976b875fa298a20dd03d76233481161a22ad1c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala @@ -32,9 +32,9 @@ import org.apache.spark.sql.types._ abstract class UserDefinedAggregateFunction extends Serializable { /** - * A [[StructType]] represents data types of input arguments of this aggregate function.
+ * A `StructType` represents data types of input arguments of this aggregate function. * For example, if a [[UserDefinedAggregateFunction]] expects two input arguments - * with type of [[DoubleType]] and [[LongType]], the returned [[StructType]] will look like + * with type of `DoubleType` and `LongType`, the returned `StructType` will look like * * ``` * new StructType() @@ -42,7 +42,7 @@ abstract class UserDefinedAggregateFunction extends Serializable { * .add("longInput", LongType) * ``` * - * The name of a field of this [[StructType]] is only used to identify the corresponding + * The name of a field of this `StructType` is only used to identify the corresponding * input argument. Users can choose names to identify the input arguments. * * @since 1.5.0 @@ -50,10 +50,10 @@ abstract class UserDefinedAggregateFunction extends Serializable { def inputSchema: StructType /** - * A [[StructType]] represents data types of values in the aggregation buffer. + * A `StructType` represents data types of values in the aggregation buffer. * For example, if a [[UserDefinedAggregateFunction]]'s buffer has two values - * (i.e. two intermediate values) with type of [[DoubleType]] and [[LongType]], - * the returned [[StructType]] will look like + * (i.e. two intermediate values) with type of `DoubleType` and `LongType`, + * the returned `StructType` will look like * * ``` * new StructType() @@ -61,7 +61,7 @@ abstract class UserDefinedAggregateFunction extends Serializable { * .add("longInput", LongType) * ``` * - * The name of a field of this [[StructType]] is only used to identify the corresponding + * The name of a field of this `StructType` is only used to identify the corresponding * buffer value. Users can choose names to identify the input arguments. * * @since 1.5.0 @@ -69,7 +69,7 @@ abstract class UserDefinedAggregateFunction extends Serializable { def bufferSchema: StructType /** - * The [[DataType]] of the returned value of this [[UserDefinedAggregateFunction]]. + * The `DataType` of the returned value of this [[UserDefinedAggregateFunction]]. * * @since 1.5.0 */ @@ -121,7 +121,7 @@ abstract class UserDefinedAggregateFunction extends Serializable { def evaluate(buffer: Row): Any /** - * Creates a [[Column]] for this UDAF using given [[Column]]s as input arguments. + * Creates a `Column` for this UDAF using given `Column`s as input arguments. * * @since 1.5.0 */ @@ -136,8 +136,8 @@ abstract class UserDefinedAggregateFunction extends Serializable { } /** - * Creates a [[Column]] for this UDAF using the distinct values of the given - * [[Column]]s as input arguments. + * Creates a `Column` for this UDAF using the distinct values of the given + * `Column`s as input arguments. * * @since 1.5.0 */ @@ -153,7 +153,7 @@ abstract class UserDefinedAggregateFunction extends Serializable { } /** - * A [[Row]] representing a mutable aggregation buffer. + * A `Row` representing a mutable aggregation buffer. * * This is not meant to be extended outside of Spark. * diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index 7c64e28d24724f39215d6b28783bbc0f7459adbc..83857c322a0ecf1abbaa943085ee22c07e5faf16 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -40,7 +40,7 @@ case class JdbcType(databaseTypeDefinition : String, jdbcNullType : Int) * SQL dialect of a certain database or jdbc driver. 
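Pulling the `UserDefinedAggregateFunction` pieces above together, a hedged average UDAF sketch; the class, field, and column names are illustrative only:

{{{
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

class MyAverage extends UserDefinedAggregateFunction {
  // schema of the input arguments; the field name only labels the argument
  def inputSchema: StructType = new StructType().add("doubleInput", DoubleType)
  // schema of the aggregation buffer: a running sum and a count
  def bufferSchema: StructType = new StructType().add("sum", DoubleType).add("count", LongType)
  // type of the final result
  def dataType: DataType = DoubleType
  def deterministic: Boolean = true

  def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0.0
    buffer(1) = 0L
  }
  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) {
      buffer(0) = buffer.getDouble(0) + input.getDouble(0)
      buffer(1) = buffer.getLong(1) + 1L
    }
  }
  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getDouble(0) + buffer2.getDouble(0)
    buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
  }
  def evaluate(buffer: Row): Any =
    if (buffer.getLong(1) == 0L) null else buffer.getDouble(0) / buffer.getLong(1)
}

// usage: df.select(new MyAverage()(df("price")))   // "price" is a hypothetical column
}}}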
* Lots of databases define types that aren't explicitly supported * by the JDBC spec. Some JDBC drivers also report inaccurate - * information---for instance, BIT(n>1) being reported as a BIT type is quite + * information---for instance, BIT(n>1) being reported as a BIT type is quite * common, even though BIT in JDBC is meant for single-bit values. Also, there * does not appear to be a standard name for an unbounded string or binary * type; we use BLOB and CLOB by default but override with database-specific @@ -134,7 +134,7 @@ abstract class JdbcDialect extends Serializable { /** * :: DeveloperApi :: - * Registry of dialects that apply to every new jdbc [[org.apache.spark.sql.DataFrame]]. + * Registry of dialects that apply to every new jdbc `org.apache.spark.sql.DataFrame`. * * If multiple matching dialects are registered then all matching ones will be * tried in reverse order. A user-added dialect will thus be applied first, @@ -148,7 +148,7 @@ abstract class JdbcDialect extends Serializable { object JdbcDialects { /** - * Register a dialect for use on all new matching jdbc [[org.apache.spark.sql.DataFrame]]. + * Register a dialect for use on all new matching jdbc `org.apache.spark.sql.DataFrame`. * Reading an existing dialect will cause a move-to-front. * * @param dialect The new dialect. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 40b482e4c01a524685f05e0edf824f90178a29fc..c50733534e2b58dbe6f749cb5ff6c5ab92bb2628 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -27,8 +27,8 @@ import org.apache.spark.sql.execution.streaming.StreamingRelation import org.apache.spark.sql.types.StructType /** - * Interface used to load a streaming [[Dataset]] from external storage systems (e.g. file systems, - * key-value stores, etc). Use [[SparkSession.readStream]] to access this. + * Interface used to load a streaming `Dataset` from external storage systems (e.g. file systems, + * key-value stores, etc). Use `SparkSession.readStream` to access this. * * @since 2.0.0 */ @@ -109,7 +109,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo /** - * Loads input data stream in as a [[DataFrame]], for data streams that don't require a path + * Loads input data stream in as a `DataFrame`, for data streams that don't require a path * (e.g. external key-value stores). * * @since 2.0.0 @@ -125,7 +125,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo } /** - * Loads input in as a [[DataFrame]], for data streams that read from some path. + * Loads input in as a `DataFrame`, for data streams that read from some path. * * @since 2.0.0 */ @@ -134,8 +134,8 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo } /** - * Loads a JSON file stream ([[http://jsonlines.org/ JSON Lines text format or newline-delimited - * JSON]]) and returns the result as a [[DataFrame]]. + * Loads a JSON file stream (<a href="http://jsonlines.org/">JSON Lines text format or + * newline-delimited JSON</a>) and returns the result as a `DataFrame`. * * This function goes through the input once to determine the input schema. If you know the * schema in advance, use the version that specifies the schema to avoid the extra scan. 
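A sketch of registering a custom dialect through the `JdbcDialects` registry mentioned above; the URL prefix and quoting rule are invented for the example:

{{{
import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects}

object MyVendorDialect extends JdbcDialect {
  // claim JDBC URLs for this (hypothetical) vendor
  override def canHandle(url: String): Boolean = url.startsWith("jdbc:myvendor:")
  // quote identifiers the way the vendor expects
  override def quoteIdentifier(colName: String): String = "\"" + colName + "\""
}

// user-added dialects are consulted before the built-in ones
JdbcDialects.registerDialect(MyVendorDialect)
}}}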
@@ -181,7 +181,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo def json(path: String): DataFrame = format("json").load(path) /** - * Loads a CSV file stream and returns the result as a [[DataFrame]]. + * Loads a CSV file stream and returns the result as a `DataFrame`. * * This function will go through the input once to determine the input schema if `inferSchema` * is enabled. To avoid going through the entire data once, disable `inferSchema` option or @@ -243,7 +243,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo def csv(path: String): DataFrame = format("csv").load(path) /** - * Loads a Parquet file stream, returning the result as a [[DataFrame]]. + * Loads a Parquet file stream, returning the result as a `DataFrame`. * * You can set the following Parquet-specific option(s) for reading Parquet files: * <ul> @@ -262,7 +262,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo } /** - * Loads text files and returns a [[DataFrame]] whose schema starts with a string column named + * Loads text files and returns a `DataFrame` whose schema starts with a string column named * "value", and followed by partitioned columns if there are any. * * Each line in the text files is a new row in the resulting DataFrame. For example: @@ -285,7 +285,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo def text(path: String): DataFrame = format("text").load(path) /** - * Loads text file(s) and returns a [[Dataset]] of String. The underlying schema of the Dataset + * Loads text file(s) and returns a `Dataset` of String. The underlying schema of the Dataset * contains a single string column named "value". * * If the directory structure of the text files contains partitioning information, those are diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index daed1dcb7737026a2617c054abd8fc2861acd24b..b3c600ae53dbbeaff7ebcb1d451a5d1da8865061 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -26,8 +26,8 @@ import org.apache.spark.sql.execution.streaming.{ForeachSink, MemoryPlan, Memory /** * :: Experimental :: - * Interface used to write a streaming [[Dataset]] to external storage systems (e.g. file systems, - * key-value stores, etc). Use [[Dataset.writeStream]] to access this. + * Interface used to write a streaming `Dataset` to external storage systems (e.g. file systems, + * key-value stores, etc). Use `Dataset.writeStream` to access this. * * @since 2.0.0 */ @@ -273,8 +273,8 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { /** * Starts the execution of the streaming query, which will continually send results to the given - * [[ForeachWriter]] as as new data arrives. The [[ForeachWriter]] can be used to send the data - * generated by the [[DataFrame]]/[[Dataset]] to an external system. + * `ForeachWriter` as new data arrives. The `ForeachWriter` can be used to send the data + * generated by the `DataFrame`/`Dataset` to an external system.
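Tying the `DataStreamReader` methods above together, a minimal streaming-read sketch; the schema and input directory are assumptions:

{{{
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

val spark = SparkSession.builder().appName("stream-sketch").getOrCreate()

// JSON Lines files dropped into this (hypothetical) directory become streaming rows
val schema = new StructType().add("name", StringType).add("age", IntegerType)
val people = spark.readStream.schema(schema).json("/data/incoming/people")

// write each micro-batch to the console sink for inspection
val query = people.writeStream.format("console").start()
}}}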
* * Scala example: * {{{ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQuery.scala index 0a85414451981e56ee50c88931b841b919835062..374313f2ca9ab06132adc35773563755064861b3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQuery.scala @@ -31,7 +31,7 @@ trait StreamingQuery { /** * Returns the name of the query. This name is unique across all active queries. This can be - * set in the [[org.apache.spark.sql.DataStreamWriter DataStreamWriter]] as + * set in the `org.apache.spark.sql.streaming.DataStreamWriter` as * `dataframe.writeStream.queryName("query").start()`. * @since 2.0.0 */ @@ -45,7 +45,7 @@ trait StreamingQuery { def id: Long /** - * Returns the [[SparkSession]] associated with `this`. + * Returns the `SparkSession` associated with `this`. * @since 2.0.0 */ def sparkSession: SparkSession @@ -90,10 +90,11 @@ trait StreamingQuery { * immediately (if the query was terminated by `stop()`), or throw the exception * immediately (if the query has terminated with exception). * - * @throws StreamingQueryException, if `this` query has terminated with an exception. + * @throws StreamingQueryException if the query has terminated with an exception. * * @since 2.0.0 */ + @throws[StreamingQueryException] def awaitTermination(): Unit /** @@ -106,10 +107,11 @@ trait StreamingQuery { * `true` immediately (if the query was terminated by `stop()`), or throw the exception * immediately (if the query has terminated with exception). * - * @throws StreamingQueryException, if `this` query has terminated with an exception + * @throws StreamingQueryException if the query has terminated with an exception * * @since 2.0.0 */ + @throws[StreamingQueryException] def awaitTermination(timeoutMs: Long): Boolean /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index bba7bc753eea90579ddbf39021add7c2919436ff..53968a82d8e226efeca71d6809a8c2b3a5846563 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -31,7 +31,7 @@ import org.apache.spark.util.{Clock, SystemClock, Utils} /** * :: Experimental :: - * A class to manage all the [[StreamingQuery]] active on a [[SparkSession]]. + * A class to manage all the [[StreamingQuery]] active on a `SparkSession`. * * @since 2.0.0 */ @@ -81,10 +81,11 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) { * users need to stop all of them after any of them terminates with exception, and then check the * `query.exception()` for each query. * - * @throws StreamingQueryException, if any query has terminated with an exception + * @throws StreamingQueryException if any query has terminated with an exception * * @since 2.0.0 */ + @throws[StreamingQueryException] def awaitAnyTermination(): Unit = { awaitTerminationLock.synchronized { while (lastTerminatedQuery == null) { @@ -113,10 +114,11 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) { * users need to stop all of them after any of them terminates with exception, and then check the * `query.exception()` for each query. 
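Because `awaitTermination` is now annotated with `@throws[StreamingQueryException]`, callers (especially Java ones) see it as a checked exception. A hedged handling sketch, reusing the `query` value from the previous sketch:

{{{
import org.apache.spark.sql.streaming.StreamingQueryException

try {
  query.awaitTermination()          // blocks until stop() or a failure
} catch {
  case e: StreamingQueryException =>
    // the stream stopped with an error; log it and decide whether to restart
    Console.err.println(s"streaming query failed: ${e.getMessage}")
}
}}}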
* - * @throws StreamingQueryException, if any query has terminated with an exception + * @throws StreamingQueryException if any query has terminated with an exception * * @since 2.0.0 */ + @throws[StreamingQueryException] def awaitAnyTermination(timeoutMs: Long): Boolean = { val startTime = System.currentTimeMillis diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala index 4504582187b97e8fd8274a776b7a04c6fd7a4830..26ad0eadd9d4c1b9d94ea743c21987db9f8f9900 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala @@ -68,7 +68,7 @@ trait QueryExecutionListener { /** * :: Experimental :: * - * Manager for [[QueryExecutionListener]]. See [[org.apache.spark.sql.SQLContext.listenerManager]]. + * Manager for [[QueryExecutionListener]]. See `org.apache.spark.sql.SQLContext.listenerManager`. */ @Experimental @InterfaceStability.Evolving diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index e333fc7febc2a17a435dcd078a57cfd7998ed18c..a2d64da0012f148d8f019f149711f463ec0cb9a1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -57,9 +57,9 @@ import org.apache.spark.util.SerializableJobConf * @param partition a map from the partition key to the partition value (optional). If the partition * value is optional, dynamic partition insert will be performed. * As an example, `INSERT INTO tbl PARTITION (a=1, b=2) AS ...` would have - * Map('a' -> Some('1'), 'b' -> Some('2')), + * Map('a' -> Some('1'), 'b' -> Some('2')), * and `INSERT INTO tbl PARTITION (a=1, b) AS ...` - * would have Map('a' -> Some('1'), 'b' -> None). + * would have Map('a' -> Some('1'), 'b' -> None). * @param child the logical plan representing data to write to. * @param overwrite overwrite existing table or partitions. * @param ifNotExists If true, only write if the table or partition does not exist. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index 42c92ed5cae267d167c7d55ce23f385d8755067f..0a7631f7821933a22955d9830302c9eef68294cb 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -42,8 +42,8 @@ import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration /** - * [[FileFormat]] for reading ORC files. If this is moved or renamed, please update - * [[DataSource]]'s backwardCompatibilityMap. + * `FileFormat` for reading ORC files. If this is moved or renamed, please update + * `DataSource`'s backwardCompatibilityMap. 
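To illustrate the `partition` parameter convention documented for `InsertIntoHiveTable` above, the two SQL forms map onto values like these (a sketch of the mapping, not code taken from the plan itself):

{{{
// INSERT INTO tbl PARTITION (a=1, b=2) ...  -> fully static partition spec
val staticSpec: Map[String, Option[String]] = Map("a" -> Some("1"), "b" -> Some("2"))

// INSERT INTO tbl PARTITION (a=1, b) ...    -> "b" is resolved by dynamic partition insert
val dynamicSpec: Map[String, Option[String]] = Map("a" -> Some("1"), "b" -> None)
}}}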
*/ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala index f5db73b715820679abdbe4182a9b52abca375dfd..3f1f86c278db008a54014d740c4754e6b6d71928 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala @@ -38,7 +38,7 @@ private[orc] object OrcFileOperator extends Logging { * 1. Retrieving file metadata (schema and compression codecs, etc.) * 2. Read the actual file content (in this case, the given path should point to the target file) * - * @note As recorded by SPARK-8501, ORC writes an empty schema (<code>struct<></code) to an + * @note As recorded by SPARK-8501, ORC writes an empty schema (<code>struct<></code>) to an * ORC file if the file contains zero rows. This is OK for Hive since the schema of the * table is managed by metastore. But this becomes a problem when reading ORC files * directly from HDFS via Spark SQL, because we have to discover the schema from raw ORC