Commit 3bb21775 authored by Kashif Rasul, committed by Xiangrui Meng

[SPARK-8872] [MLLIB] added verification results from R for FPGrowthSuite

Author: Kashif Rasul <kashif.rasul@gmail.com>

Closes #7269 from kashif/SPARK-8872 and squashes the following commits:

2d5457f [Kashif Rasul] added R code for FP Int type
3de6808 [Kashif Rasul] added verification results from R for FPGrowthSuite
parent 8a9d9cc1
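
The `eclat()` call and the `as(..., "transactions")` coercion in the verification comments below come from the arules CRAN package, so `library(arules)` must be loaded before running them. The Spark side of the comparison can be reproduced interactively as well; the following is a minimal sketch for the String-item test case, assuming a spark-shell session with a SparkContext available as `sc` (the transactions are the same six lines the R comments split with strsplit()):

import org.apache.spark.mllib.fpm.FPGrowth

// Same six String-item transactions used in the test and the R comments.
val transactions = Seq(
  "r z h k p",
  "z y x w v u t s",
  "s x o n r",
  "x z y m t s q e",
  "z",
  "x z y r q t p").map(_.split(" "))
val rdd = sc.parallelize(transactions, 2).cache()

// minSupport = 0.5 corresponds to eclat(transactions, parameter = list(support = 0.5)),
// which the R output below reports as 18 frequent itemsets.
val model = new FPGrowth()
  .setMinSupport(0.5)
  .setNumPartitions(1)
  .run(rdd)

model.freqItemsets.collect().foreach { itemset =>
  println(itemset.items.mkString("{", ",", "}") + ": " + itemset.freq)
}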
@@ -39,6 +39,22 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
.setMinSupport(0.9)
.setNumPartitions(1)
.run(rdd)
/* Verify results using the `R` code:
transactions = as(sapply(
list("r z h k p",
"z y x w v u t s",
"s x o n r",
"x z y m t s q e",
"z",
"x z y r q t p"),
FUN=function(x) strsplit(x," ",fixed=TRUE)),
"transactions")
> eclat(transactions, parameter = list(support = 0.9))
...
eclat - zero frequent items
set of 0 itemsets
*/
assert(model6.freqItemsets.count() === 0)
val model3 = fpg
@@ -48,6 +64,33 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
val freqItemsets3 = model3.freqItemsets.collect().map { itemset =>
(itemset.items.toSet, itemset.freq)
}
/* Verify results using the `R` code:
fp = eclat(transactions, parameter = list(support = 0.5))
fpDF = as(sort(fp), "data.frame")
fpDF$support = fpDF$support * length(transactions)
names(fpDF)[names(fpDF) == "support"] = "freq"
> fpDF
items freq
13 {z} 5
14 {x} 4
1 {s,x} 3
2 {t,x,y,z} 3
3 {t,y,z} 3
4 {t,x,y} 3
5 {x,y,z} 3
6 {y,z} 3
7 {x,y} 3
8 {t,y} 3
9 {t,x,z} 3
10 {t,z} 3
11 {t,x} 3
12 {x,z} 3
15 {t} 3
16 {y} 3
17 {s} 3
18 {r} 3
*/
val expected = Set(
(Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L),
(Set("r"), 3L),
@@ -62,12 +105,30 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
.setMinSupport(0.3)
.setNumPartitions(4)
.run(rdd)
/* Verify results using the `R` code:
fp = eclat(transactions, parameter = list(support = 0.3))
fpDF = as(fp, "data.frame")
fpDF$support = fpDF$support * length(transactions)
names(fpDF)[names(fpDF) == "support"] = "freq"
> nrow(fpDF)
[1] 54
*/
assert(model2.freqItemsets.count() === 54)
val model1 = fpg
.setMinSupport(0.1)
.setNumPartitions(8)
.run(rdd)
/* Verify results using the `R` code:
fp = eclat(transactions, parameter = list(support = 0.1))
fpDF = as(fp, "data.frame")
fpDF$support = fpDF$support * length(transactions)
names(fpDF)[names(fpDF) == "support"] = "freq"
> nrow(fpDF)
[1] 625
*/
assert(model1.freqItemsets.count() === 625)
}
@@ -89,6 +150,23 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
.setMinSupport(0.9)
.setNumPartitions(1)
.run(rdd)
/* Verify results using the `R` code:
transactions = as(sapply(
list("1 2 3",
"1 2 3 4",
"5 4 3 2 1",
"6 5 4 3 2 1",
"2 4",
"1 3",
"1 7"),
FUN=function(x) strsplit(x," ",fixed=TRUE)),
"transactions")
> eclat(transactions, parameter = list(support = 0.9))
...
eclat - zero frequent items
set of 0 itemsets
*/
assert(model6.freqItemsets.count() === 0)
val model3 = fpg
@@ -100,6 +178,24 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
val freqItemsets3 = model3.freqItemsets.collect().map { itemset =>
(itemset.items.toSet, itemset.freq)
}
/* Verify results using the `R` code:
fp = eclat(transactions, parameter = list(support = 0.5))
fpDF = as(sort(fp), "data.frame")
fpDF$support = fpDF$support * length(transactions)
names(fpDF)[names(fpDF) == "support"] = "freq"
> fpDF
items freq
6 {1} 6
3 {1,3} 5
7 {2} 5
8 {3} 5
1 {2,4} 4
2 {1,2,3} 4
4 {2,3} 4
5 {1,2} 4
9 {4} 4
*/
val expected = Set(
(Set(1), 6L), (Set(2), 5L), (Set(3), 5L), (Set(4), 4L),
(Set(1, 2), 4L), (Set(1, 3), 5L), (Set(2, 3), 4L),
@@ -110,12 +206,30 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
.setMinSupport(0.3)
.setNumPartitions(4)
.run(rdd)
/* Verify results using the `R` code:
fp = eclat(transactions, parameter = list(support = 0.3))
fpDF = as(fp, "data.frame")
fpDF$support = fpDF$support * length(transactions)
names(fpDF)[names(fpDF) == "support"] = "freq"
> nrow(fpDF)
[1] 15
*/
assert(model2.freqItemsets.count() === 15)
val model1 = fpg
.setMinSupport(0.1)
.setNumPartitions(8)
.run(rdd)
/* Verify results using the `R` code:
fp = eclat(transactions, parameter = list(support = 0.1))
fpDF = as(fp, "data.frame")
fpDF$support = fpDF$support * length(transactions)
names(fpDF)[names(fpDF) == "support"] = "freq"
> nrow(fpDF)
[1] 65
*/
assert(model1.freqItemsets.count() === 65)
}
}
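
The Int-item test case in the same file can be checked the same way. A sketch mirroring the second set of R comments, again assuming a spark-shell session with `sc` available:

import org.apache.spark.mllib.fpm.FPGrowth

// Same seven Int-item transactions as in the second R verification block.
val intTransactions = Seq(
  "1 2 3",
  "1 2 3 4",
  "5 4 3 2 1",
  "6 5 4 3 2 1",
  "2 4",
  "1 3",
  "1 7").map(_.split(" ").map(_.toInt))
val rdd = sc.parallelize(intTransactions, 2).cache()

// At minSupport = 0.5 the R data frame above lists 9 frequent itemsets.
val model = new FPGrowth()
  .setMinSupport(0.5)
  .setNumPartitions(1)
  .run(rdd)

println(model.freqItemsets.count())  // expected: 9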