Skip to content
Snippets Groups Projects
Commit 8c32b2e8, authored by Feynman Liang and committed by Xiangrui Meng
Browse files

[SPARK-8877] [MLLIB] Public API for association rule generation

Adds FPGrowth.generateAssociationRules to public API for generating association rules after mining frequent itemsets.

Author: Feynman Liang <fliang@databricks.com>

Closes #7271 from feynmanliang/SPARK-8877 and squashes the following commits:

83b8baf [Feynman Liang] Add API Doc
867abff [Feynman Liang] Add FPGrowth.generateAssociationRules and change access modifiers for AssociationRules
parent 381cb161
No related branches found
No related tags found
No related merge requests found
......@@ -33,7 +33,7 @@ import org.apache.spark.rdd.RDD
* association rules which have a single item as the consequent.
*/
@Experimental
class AssociationRules private (
class AssociationRules private[fpm] (
private var minConfidence: Double) extends Logging with Serializable {
/**
......@@ -45,6 +45,7 @@ class AssociationRules private (
* Sets the minimal confidence (default: `0.8`).
*/
def setMinConfidence(minConfidence: Double): this.type = {
  // Accept only values in the closed interval [0, 1]. The message reports the rejected
  // value so the caller sees more than a bare "requirement failed".
  require(minConfidence >= 0.0 && minConfidence <= 1.0,
    s"Minimal confidence must be in range [0, 1] but got $minConfidence")
  this.minConfidence = minConfidence
  // Return this instance so setter calls can be chained.
  this
}
......@@ -91,7 +92,7 @@ object AssociationRules {
* @tparam Item item type
*/
@Experimental
class Rule[Item] private[mllib] (
class Rule[Item] private[fpm] (
val antecedent: Array[Item],
val consequent: Array[Item],
freqUnion: Double,
......
......@@ -40,7 +40,16 @@ import org.apache.spark.storage.StorageLevel
* @tparam Item item type
*/
@Experimental
class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable
class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable {
  /**
   * Generates association rules for the `Item`s in [[freqItemsets]].
   * @param confidence minimal confidence of the rules produced; must lie in `[0, 1]`
   * @return the rules obtained by running [[AssociationRules]] over `freqItemsets`
   * @throws IllegalArgumentException if `confidence` is outside `[0, 1]`
   */
  def generateAssociationRules(confidence: Double): RDD[AssociationRules.Rule[Item]] = {
    // The value goes straight to the constructor rather than through
    // AssociationRules.setMinConfidence, so apply the same range check here.
    require(confidence >= 0.0 && confidence <= 1.0,
      s"Minimal confidence must be in range [0, 1] but got $confidence")
    val associationRules = new AssociationRules(confidence)
    associationRules.run(freqItemsets)
  }
}
/**
* :: Experimental ::
......
......@@ -132,6 +132,48 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
assert(model1.freqItemsets.count() === 625)
}
test("FP-Growth String type association rule generation") {
  // Each string is one transaction; items are separated by single spaces.
  val baskets = Seq(
    "r z h k p",
    "z y x w v u t s",
    "s x o n r",
    "x z y m t s q e",
    "z",
    "x z y r q t p")
  val transactions = baskets.map(_.split(" "))
  val rdd = sc.parallelize(transactions, 2).cache()

  /* Verify results using the `R` code:
     transactions = as(sapply(
       list("r z h k p",
            "z y x w v u t s",
            "s x o n r",
            "x z y m t s q e",
            "z",
            "x z y r q t p"),
       FUN=function(x) strsplit(x," ",fixed=TRUE)),
       "transactions")
     ars = apriori(transactions,
                   parameter = list(support = 0.0, confidence = 0.5, target="rules", minlen=2))
     arsDF = as(ars, "data.frame")
     arsDF$support = arsDF$support * length(transactions)
     names(arsDF)[names(arsDF) == "support"] = "freq"
     > nrow(arsDF)
     [1] 23
     > sum(arsDF$confidence == 1)
     [1] 23
   */
  // NOTE(review): the R call above uses support = 0.0 and confidence = 0.5, whereas this
  // test mines with minSupport 0.5 and generates rules at confidence 0.9. Confirm that the
  // quoted R output corresponds to the parameters actually used here.

  // Mine frequent itemsets first, then derive the rules from the resulting model.
  val model = new FPGrowth()
    .setMinSupport(0.5)
    .setNumPartitions(2)
    .run(rdd)
  val rules = model.generateAssociationRules(0.9).collect()

  assert(rules.length === 23)
  // Every generated rule is expected to have confidence 1, up to floating-point tolerance.
  val rulesWithFullConfidence = rules.count(r => math.abs(r.confidence - 1.0) < 1e-6)
  assert(rulesWithFullConfidence == 23)
}
test("FP-Growth using Int type") {
val transactions = Seq(
"1 2 3",
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment