diff --git a/python/examples/logistic_regression.py b/python/examples/logistic_regression.py
index 3ac1bae4e9ad3d9852b7e7f13dbe57414a1bf4e6..1117dea5380e764a82174e21ca1b0bfbf9b9b974 100755
--- a/python/examples/logistic_regression.py
+++ b/python/examples/logistic_regression.py
@@ -16,7 +16,8 @@
 #
 
 """
-This example requires numpy (http://www.numpy.org/)
+A logistic regression implementation that uses NumPy (http://www.numpy.org) to act on batches
+of input data using efficient matrix operations.
 """
 from collections import namedtuple
 from math import exp
@@ -27,47 +28,45 @@ import numpy as np
 
 from pyspark import SparkContext
 
-N = 100000  # Number of data points
 D = 10  # Number of dimensions
-R = 0.7   # Scaling factor
-ITERATIONS = 5
-np.random.seed(42)
 
 
-DataPoint = namedtuple("DataPoint", ['x', 'y'])
-from logistic_regression import DataPoint  # So that DataPoint is properly serialized
-
-
-def generateData():
-    def generatePoint(i):
-        y = -1 if i % 2 == 0 else 1
-        x = np.random.normal(size=D) + (y * R)
-        return DataPoint(x, y)
-    return [generatePoint(i) for i in range(N)]
-
+# Read a batch of points from the input file into a NumPy matrix object. We operate on batches to
+# make further computations faster.
+# The data file contains lines of the form <label> <x1> <x2> ... <xD>. We load each block of these
+# into a NumPy array of size numLines * (D + 1) and pull out column 0 vs the others in gradient().
+def readPointBatch(iterator):
+    strs = list(iterator)
+    matrix = np.zeros((len(strs), D + 1))
+    for i in xrange(len(strs)):
+        matrix[i] = np.fromstring(strs[i].replace(',', ' '), dtype=np.float32, sep=' ')
+    return [matrix]
 
 if __name__ == "__main__":
-    if len(sys.argv) == 1:
-        print >> sys.stderr, "Usage: logistic_regression <master> [<slices>]"
+    if len(sys.argv) != 4:
+        print >> sys.stderr, "Usage: logistic_regression <master> <file> <iters>"
         exit(-1)
     sc = SparkContext(sys.argv[1], "PythonLR", pyFiles=[realpath(__file__)])
-    slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2
-    points = sc.parallelize(generateData(), slices).cache()
+    points = sc.textFile(sys.argv[2]).mapPartitions(readPointBatch).cache()
+    iterations = int(sys.argv[3])
 
     # Initialize w to a random value
     w = 2 * np.random.ranf(size=D) - 1
     print "Initial w: " + str(w)
 
+    # Compute logistic regression gradient for a matrix of data points
+    def gradient(matrix, w):
+        Y = matrix[:,0]     # point labels (first column of input file)
+        X = matrix[:,1:]    # point coordinates
+        # For each point (x, y), compute gradient function, then sum these up
+        return ((1.0 / (1.0 + np.exp(-Y * X.dot(w))) - 1.0) * Y * X.T).sum(1)
+
     def add(x, y):
         x += y
         return x
 
-    for i in range(1, ITERATIONS + 1):
-        print "On iteration %i" % i
-
-        gradient = points.map(lambda p:
-            (1.0 / (1.0 + exp(-p.y * np.dot(w, p.x)))) * p.y * p.x
-        ).reduce(add)
-        w -= gradient
+    for i in range(iterations):
+        print "On iteration %i" % (i + 1)
+        w -= points.map(lambda m: gradient(m, w)).reduce(add)
 
     print "Final w: " + str(w)
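
For reference, the rewritten example takes three arguments (master, input file, iteration count),
and readPointBatch expects input lines of the form <label> <x1> <x2> ... <x10>, separated by
spaces or commas. A hypothetical invocation (the master URL and file path below are placeholders,
and the exact launcher depends on the Spark version in use) might look like:

    ./pyspark python/examples/logistic_regression.py local data/lr_points.txt 5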
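The core of the change is that gradient() collapses the per-point gradient formula into a single
matrix expression over a whole batch. Below is a minimal standalone NumPy sketch, not part of the
patch, that checks the vectorized expression against an explicit per-point loop of the same
formula; the names check_gradient and n are invented for illustration:

    from math import exp
    import numpy as np

    D = 10

    def check_gradient():
        n = 100                              # hypothetical batch size
        matrix = np.random.ranf((n, D + 1))  # column 0 = labels, columns 1..D = coordinates
        matrix[:, 0] = np.where(matrix[:, 0] < 0.5, -1.0, 1.0)
        w = 2 * np.random.ranf(size=D) - 1
        Y = matrix[:, 0]
        X = matrix[:, 1:]
        # Batched form from the patch: one vectorized expression over the whole block
        batched = ((1.0 / (1.0 + np.exp(-Y * X.dot(w))) - 1.0) * Y * X.T).sum(1)
        # Same formula applied one point at a time, then summed over the rows
        per_point = sum((1.0 / (1.0 + exp(-y * np.dot(w, x))) - 1.0) * y * x
                        for x, y in zip(X, Y))
        assert np.allclose(batched, per_point)

    check_gradient()

Because X.T has shape (D, n), multiplying by the length-n vector of per-point scalars broadcasts
across rows, and .sum(1) adds up the n per-point gradient vectors into a single length-D gradient.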