Commit bf7033f3 authored by Ginger Smith

fixing formatting, style, and input

parent 8c8947e2
@@ -28,32 +28,32 @@ import spark.mllib.util.MLUtils
  * Generate RDD(s) containing data for Matrix Factorization.
  *
  * This method samples training entries according to the oversampling factor
- * 'tr_samp_fact', which is a multiplicative factor of the number of
+ * 'trainSampFact', which is a multiplicative factor of the number of
  * degrees of freedom of the matrix: rank*(m+n-rank).
  *
  * It optionally samples entries for a testing matrix using
- * 'te_samp_fact', the percentage of the number of training entries
+ * 'testSampFact', the percentage of the number of training entries
  * to use for testing.
  *
  * This method takes the following inputs:
  *   sparkMaster (String) The master URL.
  *   outputPath (String) Directory to save output.
  *   m (Int) Number of rows in data matrix.
  *   n (Int) Number of columns in data matrix.
  *   rank (Int) Underlying rank of data matrix.
- *   tr_samp_fact (Double) Oversampling factor.
+ *   trainSampFact (Double) Oversampling factor.
  *   noise (Boolean) Whether to add gaussian noise to training data.
  *   sigma (Double) Standard deviation of added gaussian noise.
  *   test (Boolean) Whether to create testing RDD.
- *   te_samp_fact (Double) Percentage of training data to use as test data.
+ *   testSampFact (Double) Percentage of training data to use as test data.
  */
 object MFDataGenerator{
   def main(args: Array[String]) {
-    if (args.length != 10) {
-      println("Usage: MFGenerator " +
-        "<master> <output_dir> <m> <n> <rank> <tr_samp_fact> <noise> <sigma> <test> <te_samp_fact>")
+    if (args.length < 2) {
+      println("Usage: MFDataGenerator " +
+        "<master> <outputDir> [m] [n] [rank] [trainSampFact] [noise] [sigma] [test] [testSampFact]")
       System.exit(1)
     }
@@ -62,51 +62,52 @@ object MFDataGenerator{
     val m: Int = if (args.length > 2) args(2).toInt else 100
     val n: Int = if (args.length > 3) args(3).toInt else 100
     val rank: Int = if (args.length > 4) args(4).toInt else 10
-    val tr_samp_fact: Double = if (args.length > 5) args(5).toDouble else 1.0
+    val trainSampFact: Double = if (args.length > 5) args(5).toDouble else 1.0
     val noise: Boolean = if (args.length > 6) args(6).toBoolean else false
     val sigma: Double = if (args.length > 7) args(7).toDouble else 0.1
     val test: Boolean = if (args.length > 8) args(8).toBoolean else false
-    val te_samp_fact: Double = if (args.length > 9) args(9).toDouble else 0.1
+    val testSampFact: Double = if (args.length > 9) args(9).toDouble else 0.1

     val sc = new SparkContext(sparkMaster, "MFDataGenerator")

-    val A = DoubleMatrix.randn(m,rank)
-    val B = DoubleMatrix.randn(rank,n)
-    val z = 1/(scala.math.sqrt(scala.math.sqrt(rank)))
+    val A = DoubleMatrix.randn(m, rank)
+    val B = DoubleMatrix.randn(rank, n)
+    val z = 1 / (scala.math.sqrt(scala.math.sqrt(rank)))
     A.mmuli(z)
     B.mmuli(z)
     val fullData = A.mmul(B)

-    val df = rank*(m+n-rank)
-    val sampsize = scala.math.min(scala.math.round(tr_samp_fact*df), scala.math.round(.99*m*n)).toInt
+    val df = rank * (m + n - rank)
+    val sampSize = scala.math.min(scala.math.round(trainSampFact * df),
+      scala.math.round(.99 * m * n)).toInt
     val rand = new Random()
-    val mn = m*n
+    val mn = m * n
     val shuffled = rand.shuffle(1 to mn toIterable)

-    val omega = shuffled.slice(0,sampsize)
+    val omega = shuffled.slice(0, sampSize)
     val ordered = omega.sortWith(_ < _).toArray
     val trainData: RDD[(Int, Int, Double)] = sc.parallelize(ordered)
-      .map(x => (fullData.indexRows(x-1),fullData.indexColumns(x-1),fullData.get(x-1)))
+      .map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1)))

     // optionally add gaussian noise
-    if(noise){
-      trainData.map(x => (x._1,x._2,x._3+rand.nextGaussian*sigma))
+    if (noise) {
+      trainData.map(x => (x._1, x._2, x._3 + rand.nextGaussian * sigma))
     }

     trainData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath)

     // optionally generate testing data
-    if(test){
-      val test_sampsize = scala.math
-        .min(scala.math.round(sampsize*te_samp_fact),scala.math.round(mn-sampsize))
-        .toInt
-      val test_omega = shuffled.slice(sampsize,sampsize+test_sampsize)
-      val test_ordered = test_omega.sortWith(_ < _).toArray
-      val testData: RDD[(Int, Int, Double)] = sc.parallelize(test_ordered)
-        .map(x=> (fullData.indexRows(x-1),fullData.indexColumns(x-1),fullData.get(x-1)))
+    if (test) {
+      val testSampSize = scala.math
+        .min(scala.math.round(sampSize * testSampFact),scala.math.round(mn - sampSize)).toInt
+
+      val testOmega = shuffled.slice(sampSize, sampSize + testSampSize)
+      val testOrdered = testOmega.sortWith(_ < _).toArray
+      val testData: RDD[(Int, Int, Double)] = sc.parallelize(testOrdered)
+        .map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1)))
       testData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath)
     }

     sc.stop()
   }
 }
\ No newline at end of file
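
For readers following the renamed parameters, here is a minimal, self-contained Scala sketch (not part of the commit; the object name SampleSizeSketch and the chosen argument values are illustrative only) that evaluates the same sampling arithmetic at the generator's default arguments (m = 100, n = 100, rank = 10, trainSampFact = 1.0, testSampFact = 0.1):

// Sketch only: recomputes the training/test sample sizes used by MFDataGenerator
// at its default arguments, to make the trainSampFact / testSampFact semantics concrete.
object SampleSizeSketch {
  def main(args: Array[String]): Unit = {
    val (m, n, rank) = (100, 100, 10)
    val trainSampFact = 1.0
    val testSampFact = 0.1

    // Degrees of freedom of an m x n matrix of rank r: r * (m + n - r).
    val df = rank * (m + n - rank)                        // 10 * 190 = 1900

    // Training entries: oversample df by trainSampFact, capped at 99% of all m*n entries.
    val sampSize = math.min(math.round(trainSampFact * df),
      math.round(0.99 * m * n)).toInt                     // min(1900, 9900) = 1900

    // Test entries: testSampFact of the training entries, capped at the entries left over.
    val testSampSize = math.min(math.round(sampSize * testSampFact).toInt,
      m * n - sampSize)                                   // min(190, 8100) = 190

    println(s"df=$df, sampSize=$sampSize, testSampSize=$testSampSize")
  }
}

With these defaults the training set covers the matrix's 1900 degrees of freedom exactly once, and the optional test set holds out a further 190 entries.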