From e51345e1e04e439827a07c95887d14ba38333057 Mon Sep 17 00:00:00 2001 From: Holden Karau <holden@pigscanfly.ca> Date: Thu, 17 Sep 2015 09:17:43 -0700 Subject: [PATCH] [SPARK-10077] [DOCS] [ML] Add package info for java of ml/feature Should be the same as SPARK-7808 but use Java for the code example. It would be great to add package doc for `spark.ml.feature`. Author: Holden Karau <holden@pigscanfly.ca> Closes #8740 from holdenk/SPARK-10077-JAVA-PACKAGE-DOC-FOR-SPARK.ML.FEATURE. --- .../apache/spark/ml/feature/package-info.java | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java b/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java new file mode 100644 index 0000000000..c22d2e0cd2 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * Feature transformers + * + * The `ml.feature` package provides common feature transformers that help convert raw data or + * features into more suitable forms for model fitting. 
+ * Most feature transformers are implemented as {@link org.apache.spark.ml.Transformer}s, which + * transform one {@link org.apache.spark.sql.DataFrame} into another, e.g., + * {@link org.apache.spark.ml.feature.HashingTF}. + * Some feature transformers are implemented as {@link org.apache.spark.ml.Estimator}s, because the + * transformation requires some aggregated information of the dataset, e.g., document + * frequencies in {@link org.apache.spark.ml.feature.IDF}. + * For those feature transformers, calling {@link org.apache.spark.ml.Estimator#fit} is required to + * obtain the model first, e.g., {@link org.apache.spark.ml.feature.IDFModel}, in order to apply + * transformation. + * The transformation is usually done by appending new columns to the input + * {@link org.apache.spark.sql.DataFrame}, so all input columns are carried over. + * + * We try to make each transformer minimal, so it becomes flexible to assemble feature + * transformation pipelines. + * {@link org.apache.spark.ml.Pipeline} can be used to chain feature transformers, and + * {@link org.apache.spark.ml.feature.VectorAssembler} can be used to combine multiple feature + * transformations, for example: + * + * <pre> + * <code> + * import java.util.Arrays; + * + * import org.apache.spark.api.java.JavaRDD; + * import static org.apache.spark.sql.types.DataTypes.*; + * import org.apache.spark.sql.types.StructType; + * import org.apache.spark.sql.DataFrame; + * import org.apache.spark.sql.RowFactory; + * import org.apache.spark.sql.Row; + * + * import org.apache.spark.ml.feature.*; + * import org.apache.spark.ml.Pipeline; + * import org.apache.spark.ml.PipelineStage; + * import org.apache.spark.ml.PipelineModel; + * + * // a DataFrame with three columns: id (integer), text (string), and rating (double). 
+ * StructType schema = createStructType( + * Arrays.asList( + * createStructField("id", IntegerType, false), + * createStructField("text", StringType, false), + * createStructField("rating", DoubleType, false))); + * JavaRDD<Row> rowRDD = jsc.parallelize( + * Arrays.asList( + * RowFactory.create(0, "Hi I heard about Spark", 3.0), + * RowFactory.create(1, "I wish Java could use case classes", 4.0), + * RowFactory.create(2, "Logistic regression models are neat", 4.0))); + * DataFrame df = jsql.createDataFrame(rowRDD, schema); + * // define feature transformers + * RegexTokenizer tok = new RegexTokenizer() + * .setInputCol("text") + * .setOutputCol("words"); + * StopWordsRemover sw = new StopWordsRemover() + * .setInputCol("words") + * .setOutputCol("filtered_words"); + * HashingTF tf = new HashingTF() + * .setInputCol("filtered_words") + * .setOutputCol("tf") + * .setNumFeatures(10000); + * IDF idf = new IDF() + * .setInputCol("tf") + * .setOutputCol("tf_idf"); + * VectorAssembler assembler = new VectorAssembler() + * .setInputCols(new String[] {"tf_idf", "rating"}) + * .setOutputCol("features"); + * + * // assemble and fit the feature transformation pipeline + * Pipeline pipeline = new Pipeline() + * .setStages(new PipelineStage[] {tok, sw, tf, idf, assembler}); + * PipelineModel model = pipeline.fit(df); + * + * // save transformed features with raw data + * model.transform(df) + * .select("id", "text", "rating", "features") + * .write().format("parquet").save("/output/path"); + * </code> + * </pre> + * + * Some feature transformers implemented in MLlib are inspired by those implemented in scikit-learn. + * The major difference is that most scikit-learn feature transformers operate eagerly on the entire + * input dataset, while MLlib's feature transformers operate lazily on individual columns, + * which is more efficient and flexible to handle large and complex datasets. 
+ * + * @see <a href="http://scikit-learn.org/stable/modules/preprocessing.html" target="_blank"> + * scikit-learn.preprocessing</a> + */ +package org.apache.spark.ml.feature; -- GitLab