diff --git a/apache-spark/README.md b/apache-spark/README.md
index 70df5b644420d0ccb5cc74a869229547420dc1b8..0e22cc9e0adf81f24075379f1226fadf9754f2df 100644
--- a/apache-spark/README.md
+++ b/apache-spark/README.md
@@ -1,27 +1,27 @@
-## Prerequisites:
+# Prerequisites:
 
-# Java 7+ installed on master & slave machines
-# Scala 2+ installed on master & slave machines
-# PySpark installed on master machine (`sudo pip install pyspark`)
-# apache-spark project & streaming files cloned on each slave machine
+## Java 7+ installed on master & slave machines
+## Scala 2+ installed on master & slave machines
+## PySpark installed on master machine (`sudo pip install pyspark`)
+## apache-spark project & streaming files cloned on each slave machine
 
-## Start up Spark cluster
+# Start up Spark cluster
 
 ```
 ./sbin/start-all.sh
 ```
 
-# This starts up VMs 2-10 as worker machines, cluster summary can be seen on http://MASTER-IP:8080/
+## This starts up VMs 2-10 as worker machines, cluster summary can be seen on http://MASTER-IP:8080/
 
-## Submit a word count job to the cluster
+# Submit a word count job to the cluster
 
 ```
 bin/spark-submit --master spark://MASTER-IP:7077 --deploy-mode client ~/apache-spark/python/wordcount.py ~/apache-spark/python/higgs-activity_time.txt
 ```
 
-# Job summary can be seen on http://MASTER-IP:4040/
+## Job summary can be seen on http://MASTER-IP:4040/
 
 # Notes:
 
-# 1. spark-env.sh file within conf/ should be modified as per job & master/slave requirements
-# 2. /etc/hosts file on master & slave should have IP addresses resolved to the same hostnames within the slaves file in conf/
+## 1. `spark-env.sh` file within `conf/` should be modified as per job & master/slave requirements
+## 2. `/etc/hosts` file on master & slave should have IP addresses resolved to the same hostnames within the slaves file in `conf/`