From a052269f39d29c5cb7728e6b59a7293fa6da33eb Mon Sep 17 00:00:00 2001
From: leepei <leepei>
Date: Mon, 13 Jul 2015 03:00:33 -0700
Subject: [PATCH] update exp descriptions

---
 doc/Makefile           |   2 +-
 doc/disdca-cocoap.tex  | 377 -----------------------------------------
 doc/distcd-cogcomp.tex |  33 ++--
 doc/distcd.tex         |  16 +-
 4 files changed, 25 insertions(+), 403 deletions(-)
 delete mode 100644 doc/disdca-cocoap.tex

diff --git a/doc/Makefile b/doc/Makefile
index ccdadba..6326c44 100755
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -1,4 +1,4 @@
-FILES = distcd.pdf supple.pdf distcd-cogcomp.pdf disdca-cocoap.pdf
+FILES = distcd.pdf supple.pdf distcd-cogcomp.pdf
 TEX_DEP = *.tex *.bib
 .SUFFIXES: .tex .in .dvi .ps .pdf
 .PHONY: all
diff --git a/doc/disdca-cocoap.tex b/doc/disdca-cocoap.tex
deleted file mode 100644
index e18429a..0000000
--- a/doc/disdca-cocoap.tex
+++ /dev/null
@@ -1,377 +0,0 @@
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%%%%%%% ICML 2015 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\documentclass{article}
-\pdfoutput=1
-
-\usepackage[pointedenum]{paralist}
-\usepackage{enumitem,listings,color,verbatim}
-\usepackage{times}
-\usepackage{graphicx}
-\usepackage{subfigure}
-\usepackage{algorithm}
-\usepackage{xspace,amsmath}
-\usepackage{xr}
-\usepackage{hyperref}
-
-\DeclareMathOperator*{\argmax}{arg\,max}
-\newcommand{\bqoe}{$\mbox{{\sf BQO-E}}$\xspace}
-\def\R{{ \mathbf{R}}}
-\def\N{{ \mathbf{N}}}
-\def\P{{ \mathcal{P}}}
-\definecolor{dkgreen}{rgb}{0,0.6,0}
-\definecolor{listinggray}{gray}{0.9}
-\definecolor{lbcolor}{rgb}{0.9,0.9,0.9}
-\def\bx{{\boldsymbol x}}
-\def\bu{{\boldsymbol u}}
-\def\bw{{\boldsymbol w}}
-\def\bv{{\boldsymbol v}}
-\def\be{{\boldsymbol e}}
-\def\bd{{\boldsymbol d}}
-\def\bzero{{\boldsymbol 0}}
-\def\AL{{\boldsymbol{\alpha}}}
-\def\webspam{{\sf webspam}\xspace}
-\def\eps{{\sf epsilon}\xspace}
-\def\uu{{\sf url}\xspace}
-\def\tron{{\sf TRON}\xspace}
-\newcommand{\disdca}{$\mbox{{\sf DisDCA}}$\xspace}
-\newcommand{\cocoa}{$\mbox{{\sf CoCoA}}$\xspace}
-\newcommand{\mpi}{$\mbox{{\sf MPI-LIBLINEAR}}$\xspace}
-\newcommand{\dsvm}{$\mbox{{\sf DSVM-AVE}}$\xspace}
-\newcommand{\svmls}{$\mbox{{\sf DSVM-LS}}$\xspace}
-\newcommand{\BlackBox}{\rule{1.5ex}{1.5ex}}  % end of proof
-\newenvironment{proof}{\par\noindent{\bf Proof\ }}{\hfill\BlackBox\\[2mm]}
-\newtheorem{theorem}{Theorem}[section]
-\newtheorem{lemma}[theorem]{Lemma}
-\newtheorem{corollary}[theorem]{Corollary}
-\newtheorem{innercustomthm}{Theorem}
-\newenvironment{customthm}[1]
-{\renewcommand\theinnercustomthm{#1}\innercustomthm}
-{\endinnercustomthm}
-% Employ this version of the ``usepackage'' statement after the paper has
-% been accepted, when creating the final version.  This will set the
-% note in the first column to ``Proceedings of the...''
-\usepackage[accepted]{icml2015}
-
-\newcommand{\cocoap}{$\mbox{{\sf CoCoA+}}$\xspace}
-\newcommand{\birds}{$\mbox{{\sf Birds}}$\xspace}
-
-\icmltitlerunning{On the Equivalence of \cocoap and \disdca}
-
-\begin{document}
-\lstdefinelanguage{scala}{
-	morekeywords={%
-		abstract,case,catch,class,def,do,else,extends,%
-		false,final,finally,for,forSome,if,implicit,import,lazy,%
-		match,new,null,object,override,package,private,protected,%
-		return,sealed,super,this,throw,trait,true,try,type,%
-	val,var,while,with,yield},
-	otherkeywords={=>,<-,<\%,<:,>:,\#,@},
-		sensitive=true,
-		morecomment=[l]{//},
-		morecomment=[n]{/*}{*/},
-		morestring=[b]",
-		morestring=[b]',
-		morestring=[b]"""
-	}[keywords,comments,strings]
-
-	% activate the language and predefine settings
-%	\lstset{language=Scala}
-
-
-\twocolumn[
-\icmltitle{On the Equivalence of \cocoap and \disdca}
-\icmlauthor{Ching-pei Lee}{clee149@illinois.edu}
-\icmladdress{University of Illinois at Urbana-Champaign,
-201 N. Goodwin Avenue, Urbana, IL 61801 USA}
-]
-
-In this document, we show that the algorithm \cocoap
-\citep{CM15a}, under the setting used in their experiments
-(which is also the best setting suggested by the
-authors who proposed the algorithm),
-is equivalent to the practical variant of \disdca
-\citep{TBY13a}.
-
-\section{Notations}
-To compare the problems being solved, we first unify the
-notation of the two papers.
-Given training instances $\{(x_i,y_i)\}_{i=1}^n$,
-the problems being solved are Problem (2) in \cite{CM15a}:
-\begin{equation}
-	\max_{\alpha \in \R^n} -\frac{1}{n}\sum_{j=1}^n
-	l^*_{j}\left(-\alpha_j\right) - \frac{\lambda}{2}
-	\|\frac{1}{\lambda n} A\alpha\|^2,
-	\label{eq:dual}
-\end{equation}
-	where $A = [x_1,x_2,\ldots,x_n]$,
-and Problem (2) in \cite{TBY13a}
-\begin{equation*}
-	\max_{\alpha \in \R^n} \frac{1}{n}\sum_{i=1}^n
-	-\phi^*_{i}\left(-\alpha_i\right) - \lambda g^*\left(
-	\frac{1}{\lambda n} \sum_{i=1}^n \alpha_i x_i\right).
-\end{equation*}
-By considering the special case $g^*(\cdot) = \frac{1}{2}\|
-\cdot \|^2$ and noting that $\phi^*_i$ and $l^*_i$ are just
-different notations for the same function,
-the two problems are equivalent.
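-For concreteness, substituting $g^*(\cdot) = \frac{1}{2}\|\cdot\|^2$ and
-$\sum_{i=1}^n \alpha_i x_i = A\alpha$ into the latter problem gives
-\begin{equation*}
-	\max_{\alpha \in \R^n} \frac{1}{n}\sum_{i=1}^n
-	-\phi^*_{i}\left(-\alpha_i\right) - \frac{\lambda}{2}
-	\left\|\frac{1}{\lambda n} A\alpha\right\|^2,
-\end{equation*}
-which is exactly \eqref{eq:dual} after renaming $\phi^*_i$ to $l^*_i$.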
-In the following analysis, we use the formulation of \eqref{eq:dual}.
-
-\section{Algorithms}
-\label{sec:alg}
-Assume there are $K$ machines disjointly storing the
-training instances.
-The index set of the instances on machine $k$ is denoted by
-$P_k$, and for any vector $\bv \in \R^n$, we define the vector
-$\bv_{[k]} \in \R^n$ by
-\begin{equation*}
-	\left(\bv_{[k]}\right)_i = \begin{cases}
-		0, &\text{ if } i \notin P_k,\\
-		\bv_i, &\text{ if } i \in P_k.
-	\end{cases}
-\end{equation*}
-We list the \cocoap algorithm under the best
-setting ($\sigma' = K, \gamma = 1$) suggested by the
-authors of \cite{CM15a} in Algorithm \ref{alg:cocoap},
-and the practical variant of \disdca
-in Algorithm \ref{alg:disdca}.
-Note that for \disdca with $g(\bw) = \frac{1}{2}\bw^T \bw$,
-we have $\nabla g(\bw) = \bw$.
-
-Clearly, letting
-\begin{equation*}
-	\Delta \alpha =  \delta,\quad \bu =
-	\bw^t + \frac{K}{\lambda n}A \Delta \alpha_{[k]},
-\end{equation*}
-and observing that $\AL$ in Algorithm \ref{alg:disdca} is equivalent to
-$\AL + \Delta\AL_{[k]}$ in Algorithm \ref{alg:cocoap} by their update rules in line 1.2.3 of both algorithms,
-we see that \eqref{eq:cocoap} and \eqref{eq:disdca} are equivalent problems.
-Note that line 1.2.4 in Algorithm \ref{alg:disdca} indeed
-ensures $\bu = \bw^t + \frac{K}{\lambda n}A \Delta \alpha_{[k]}$.
-Thus, the two algorithms are identical.
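-
-To verify that line 1.2.4 of Algorithm \ref{alg:disdca} preserves the relation
-$\bu = \bw^t + \frac{K}{\lambda n}A \Delta \alpha_{[k]}$, note that $A e_i = x_i$, so
-\begin{equation*}
-	\bw^t + \frac{K}{\lambda n}A\left(\Delta \alpha_{[k]} + \Delta\AL_i\, e_i\right)
-	= \left(\bw^t + \frac{K}{\lambda n}A \Delta \alpha_{[k]}\right)
-	+ \frac{K}{\lambda n}\Delta\AL_i\, x_i,
-\end{equation*}
-that is, the two sides of the relation grow by the same amount in lines 1.2.3 and 1.2.4 of the respective algorithms.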
-\begin{algorithm*}
-	\label{alg:cocoap}
-	\begin{itemize}
-		\item Input: number of iterations $T$, number of inner iterations $H$ for the local SDCA solver.
-		\item Let $\AL = \bzero$, $\bw^0 = \bzero$:
-		\item For $t=1,2,\ldots,T$:
-			\begin{enumerate}
-		\item Run the following process on the $K$ machines in parallel:
-			\begin{enumerate}
-					\item Let $\Delta \AL = \bzero$.
-					\item For $h=1,\ldots,H:$
-						\begin{enumerate}
-							\item Pick $i \in P_k$ uniformly at random.
-							\item Solve
-						\end{enumerate}
-								\begin{align}
-									\delta_i^*
-									&= \argmax_{\delta \in \R}\quad
-								- l_i^*\left(-\left(\alpha_i + \Delta \alpha_i\right) - \delta\right)
-				-\left(\bw^{t}\right)^T A \left(\Delta \alpha_{[k]} + \delta e_i\right)
-				- \frac{\lambda nK}{2}\| \frac{1}{\lambda n} A\left(\Delta \alpha_{[k]} + \delta e_i\right)\|^2\nonumber\\
-				&= \argmax_{\delta \in \R}\quad
-				- l_i^*\left(-\left(\alpha_i + \Delta \alpha_i \right) - \delta\right)
-				-\left(\bw^{t} + \frac{K}{\lambda n}A\Delta \alpha_{[k]}\right)^T x_i \delta
-				- \frac{K}{2 \lambda n}\|x_i\|^2 \delta^2
-				\label{eq:cocoap}
-			\end{align}
-	\begin{enumerate}
-		\item[1.2.3.] $\Delta \alpha_{[k]} \leftarrow \Delta \alpha_{[k]} + \delta_i^* e_i$.
-
-	\end{enumerate}
-\item Update $\alpha_{[k]} = \alpha_{[k]} + \Delta \alpha_{[k]}$
-	\end{enumerate}
-\item Obtain $\bw^{t+1} = \bw^t + \sum_{k=1}^K \frac{1}{\lambda n} A \Delta \AL_{[k]}$ on all machines
-	\end{enumerate}
-\end{itemize}
-	\caption{The \cocoap algorithm, under the setting suggested by the authors and also the setting used in the experiments of \cite{CM15a}.}
-\end{algorithm*}
-
-\begin{algorithm*}
-	\label{alg:disdca}
-	\begin{itemize}
-		\item Input: number of iterations $T$, number of inner iterations $H$ for the local SDCA solver.
-		\item Let $\AL = \bzero$, $\bw^0 = \bzero$:
-		\item For $t=1,2,\ldots,T$:
-			\begin{enumerate}
-		\item Run the following process on the $K$ machines in parallel:
-			\begin{enumerate}
-					\item Let $\bu = \bw^t$.
-					\item For $h=1,\ldots,H:$
-						\begin{enumerate}
-							\item Pick $i \in P_k$ uniformly at random.
-							\item Solve
-						\end{enumerate}
-								\begin{equation}
-									\Delta \AL_i
-				= \argmax_{\Delta \alpha \in \R}\quad
-				- l_i^*\left(-\alpha_i - \Delta \alpha\right)
-				-\bu^T  x_i \Delta \alpha
-				- \frac{K}{2 \lambda n}\|x_i\|^2 \left(\Delta \alpha\right)^2
-				\label{eq:disdca}
-			\end{equation}
-	\begin{enumerate}
-		\item[1.2.3.] $\alpha_i = \alpha_i + \Delta \AL_i$.
-		\item[1.2.4.] $\bu = \bu + \frac{K}{\lambda n}\Delta \AL_i\, x_i$.
-
-	\end{enumerate}
-	\end{enumerate}
-\item Obtain $\bw^{t+1} = \bw^t + \sum_{k=1}^K \frac{1}{\lambda n} A \Delta \AL_{[k]}$ on all machines
-	\end{enumerate}
-\end{itemize}
-\caption{The practical variant of the \disdca algorithm.}
-\end{algorithm*}
-
-\section{Implementation Comparison}
-Here we compare the two code releases, \cocoap and \birds.
-\cocoap is the code released by the authors of \cite{CM15a}, implementing their algorithm in Apache Spark.
-As indicated in \cite{CM15a}, it is available at \url{http://github.com/gingsmith/cocoa/}.
-\birds is the code released by the author of \cite{TBY13a}, implementing in C++ and MPI the practical variant of \disdca proposed in that work.
-It is available at \url{http://homepage.cs.uiowa.edu/~tyng/software.html}.
-
-We excerpt the core parts of both codes that solve the local sub-problems to verify our argument in Section \ref{sec:alg}.
-Figure \ref{fig:cocoa} shows lines 171-201 of the file CoCoA.scala in \cocoap.
-Figure \ref{fig:disdca} shows lines 81-90 of the file inc\_dual.cc in \birds.
-
-Note that in Figure \ref{fig:cocoa}, the variable {\em plus} in \cocoap is true, and {\em sigma} is $K$ as suggested in the paper.
-In Figure \ref{fig:disdca}, the variable {\em coeff} is $K/(\lambda n)$,
-and the variable {\em mQ} is the value of $\bx_i^T \bx_i$.
-
-In the beginning, {\em deltaW} in line 174 of CoCoA.scala is $0$, so the value of
-{\em grad} after line 174
-and the
-{\em 1 - prediction} part in line 84 of inc\_dual.cc have the following relationship:
-\begin{equation}
-	\text{grad} = -\lambda n (1 - \text{prediction})
-	\label{eq:cocoa-disdca}
-\end{equation}
-Lines 181-187 of CoCoA.scala and line 85 of inc\_dual.cc both project the dual variable back to the feasible region, though the details differ.
-
-From line 188 of CoCoA.scala, we see that {\em qii} is $K$ times {\em Data.mQ(j)} in line 84 of inc\_dual.cc.
-Combining these factors,
-we have that
-$-(\text{grad}/\text{qii})$
-in line 191 of CoCoA.scala is the same as the first term in line 84 of inc\_dual.cc:
-\begin{align*}
-	-\text{grad}/\text{qii} &= -\frac{\left(y_i \bw^T \bx_i - 1\right) \lambda n}{ \bx_i^T \bx_i K}
-	= \frac{1 - y_i \bw^T \bx_i}{\bx_i^T \bx_i \cdot \frac{K}{\lambda n}}\\
-	&= \left(1 - \text{prediction}\right) / \left(\text{Data.mQ(j) * coeff}\right)
-\end{align*}
-Then line 199 in Figure \ref{fig:cocoa} and lines 87-88 in Figure \ref{fig:disdca} update the primal variables.
-Here a difference occurs:
-the update of WA in inc\_dual.cc is $K$ times the update of deltaW in CoCoA.scala.
-Thus, in the next round of inc\_dual.cc,
-if we denote by $\text{WA}_0$ the value of WA from the previous round,
-so that the vector added to $\text{WA}_0$ is $K\,\text{deltaW}$,
-we have
-\begin{equation*}
-	\text{Data.Xw}\left(\text{WA},j\right) = \left(\text{WA}_0 + K\,\text{deltaW}\right)^T \bx_j,
-\end{equation*}
-which is exactly the same computation as line 174 of CoCoA.scala.
-Thus \eqref{eq:cocoa-disdca} still holds.
-Therefore, the algorithms behind the two implementations are identical.
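-
-As a further sanity check, the following self-contained Scala sketch (our own
-toy example, not part of either release; names mirror the two listings)
-applies both single-coordinate updates to one instance and confirms that they
-yield the same new dual variable:
-\begin{lstlisting}[language=scala]
-object UpdateCheck {
-  def main(args: Array[String]): Unit = {
-    val (lambda, n, k) = (0.01, 1000.0, 4.0) // regularizer, #instances, #machines
-    val x = Array(0.5, -1.0, 2.0)            // one training instance
-    val y = 1.0                              // its label
-    val w = Array(0.1, 0.2, -0.3)            // current primal iterate (deltaW = 0)
-    val alpha = 0.05                         // current dual variable
-    def dot(a: Array[Double], b: Array[Double]) =
-      a.zip(b).map { case (p, q) => p * q }.sum
-    // CoCoA+ rule: gradient scaled by lambda*n, qii scaled by sigma' = K.
-    val grad = (y * dot(x, w) - 1.0) * (lambda * n)
-    val qii = dot(x, x) * k
-    val cocoaAlpha = math.min(math.max(alpha - grad / qii, 0.0), 1.0)
-    // DisDCA rule: (1 - prediction) / (x^T x * coeff) with coeff = K/(lambda*n).
-    val coeff = k / (lambda * n)
-    val v = (1.0 - y * dot(x, w)) / (dot(x, x) * coeff) + alpha
-    val disdcaAlpha = math.min(math.max(v, 0.0), 1.0)
-    println("CoCoA+: " + cocoaAlpha + ", DisDCA: " + disdcaAlpha)
-    assert(math.abs(cocoaAlpha - disdcaAlpha) < 1e-12) // identical updates
-  }
-}
-\end{lstlisting}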
-
-
-\begin{figure*}
-	\lstset{
-		backgroundcolor=\color{lbcolor},
-		tabsize=4,
-		language=scala,
-		captionpos=b,
-		tabsize=3,
-		frame=lines,
-		numbers=left,
-		numberstyle=\tiny,
-		numbersep=5pt,
-		firstnumber=171,
-		breaklines=true,
-		showstringspaces=false,
-		basicstyle=\footnotesize,
-		%  identifierstyle=\color{magenta},
-		keywordstyle=\color[rgb]{0,0,1},
-		commentstyle=\color{dkgreen},
-		stringstyle=\color{red}
-	}
-	\begin{lstlisting}
-      // compute hinge loss gradient
-      val grad = {
-        if (plus) {
-          (y*(x.dot(w)+sigma*x.dot(deltaW)) - 1.0)*(lambda*n)
-        } else {
-          (y*(x.dot(w)) - 1.0)*(lambda*n)
-        }
-      }
-
-      // compute projected gradient
-      var proj_grad = grad
-      if (alpha(idx) <= 0.0)
-        proj_grad = Math.min(grad,0)
-      else if (alpha(idx) >= 1.0)
-        proj_grad = Math.max(grad,0)
-
-      if (Math.abs(proj_grad) != 0.0 ) {
-        val qii = if (plus) x.dot(x)*sigma else x.dot(x)
-        var newAlpha = 1.0
-        if (qii != 0.0) {
-          newAlpha = Math.min(Math.max((alpha(idx) - (grad / qii)), 0.0), 1.0)
-        }
-
-        // update primal and dual variables
-        val update = x.times( y*(newAlpha-alpha(idx))/(lambda*n) )
-        if (!plus) {
-          w = update.plus(w)
-        }
-        deltaW = update.plus(deltaW)
-        alpha(idx) = newAlpha
-      }
-
-	\end{lstlisting}
-
-	\caption{CoCoA.scala in \cocoap}
-	\label{fig:cocoa}
-\end{figure*}
-\begin{figure*}
-	\lstset{
-		backgroundcolor=\color{lbcolor},
-		tabsize=4,
-		language=C++,
-		captionpos=b,
-		tabsize=3,
-		frame=lines,
-		numbers=left,
-		numberstyle=\tiny,
-		numbersep=5pt,
-		firstnumber=81,
-		breaklines=true,
-		showstringspaces=false,
-		basicstyle=\footnotesize,
-		%  identifierstyle=\color{magenta},
-		keywordstyle=\color[rgb]{0,0,1},
-		commentstyle=\color{dkgreen},
-		stringstyle=\color{red}
-	}
-	\begin{lstlisting}
-		int is_class = (Data.y<int>(j)==myclass)?1:-1;
-
-		double prediction = is_class*(Data.Xw(WA,j));
-		double v = (1 - prediction)/(Data.mQ(j)*coeff) + Alpha[j];
-		v=std::max(std::min(v,1.0), 0.0);
-		double del_alpha=v- Alpha[j];
-		if(Data.fmt()=="dense") WA.add(Data.dX(j), del_alpha*is_class*coeff);
-		else                    WA.add(Data.sX(j), del_alpha*is_class*coeff);
-		phi[0] += del_alpha;
-		Alpha[j]=v;
-	\end{lstlisting}
-
-	\caption{inc\_dual.cc in \birds}
-	\label{fig:disdca}
-\end{figure*}
-
-
-\bibliography{tmp}
-\bibliographystyle{icml2015}
-
-\end{document}
-
-
diff --git a/doc/distcd-cogcomp.tex b/doc/distcd-cogcomp.tex
index e8a4230..f4a8258 100644
--- a/doc/distcd-cogcomp.tex
+++ b/doc/distcd-cogcomp.tex
@@ -3,21 +3,17 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 \documentclass{article}
-\usepackage{amsmath,enumerate,xspace,bbold,amsthm,mathtools}
-\usepackage{multirow,array,theorem,amsmath,latexsym,xspace,bibentry,tikz}
-\usepackage{array,theorem,amsmath,latexsym,hhline,pifont,xspace, bibentry}
-\usepackage[round,authoryear]{natbib}
-\usepackage[TABBOTCAP]{subfigure}
-\usepackage{grffile}
-\usepackage{breakurl}
-\usepackage{epstopdf}
 
 \usepackage[pointedenum]{paralist}
 \usepackage{enumitem}
 \usepackage{times}
 \usepackage{graphicx}
-\usepackage{algorithm}
+\usepackage{subfigure}
+\usepackage{algorithm,multirow}
+\usepackage{xspace,amsmath}
+\usepackage{xr}
 \usepackage{hyperref}
 
 \DeclareMathOperator*{\argmin}{arg\,min}
 \DeclareMathOperator*{\argmax}{arg\,max}
@@ -1287,15 +1283,15 @@ is significantly faster than the state-of-the-art primal solver
 and all existing
 distributed dual linear SVM algorithms.
 
-In experiments of \cite{CM15a},
-\disdca is significantly faster than \dsvm,
-but the difference here is not huge.
-The reason is that we consider larger data
-that are closer to those being used in real
-distributed environments.
-Also we use a larger $C$ to represent difficult problems.
+Note that the comparison between \dsvm and \disdca is consistent
+with the results in \cite{CM15a}, which show that when a smaller $\lambda$
+(equivalent to a larger weight on the loss term)
+is used, the difference between the two algorithms is less significant.
+This can also be verified by the result on the \uu data set, which has a larger $l$ and thus
+a larger loss term under a fixed $C$,
+corresponding to a $\lambda$ smaller than those considered in \cite{CM15a}.
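+To make this correspondence concrete, note that with $l$ training instances
+and loss terms $\xi_i$, the two common scalings of the objective (a standard
+identity, stated here for completeness rather than taken from either paper)
+satisfy
+\begin{equation*}
+	\min_{w}\ \frac{\lambda}{2}\|w\|^2 + \frac{1}{l}\sum_{i=1}^{l} \xi_i
+	\quad\Longleftrightarrow\quad
+	\min_{w}\ \frac{1}{2}w^Tw + C\sum_{i=1}^{l} \xi_i
+	\qquad\text{with}\quad \lambda = \frac{1}{Cl},
+\end{equation*}
+so a larger $C$ or a larger $l$ both correspond to a smaller $\lambda$.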
 Additional experiments in the supplement show that for
-smaller $C$, \disdca is superior.
+smaller $C$, the differences are significant and \disdca is superior.
 But in these cases,
 most algorithms finish training in a very short time
 and thus the setting of smaller $C$ does not provide meaningful
@@ -1352,6 +1348,7 @@ Based on this work, we have extended the package \mpi
 \url{http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/distributed-liblinear/}
 to include the proposed implementation.
 
+\pagebreak
 \section*{Acknowledgment}
 This material is based on research sponsored by DARPA under
 agreement number FA8750-13-2-0008. The U.S. Government is
@@ -1784,6 +1781,7 @@ We present results of $C=0.0001$.}
 		\label{fig:app-time}
 	\end{figure*}
 
+	\iffalse
 \section{The Equivalence of \cocoap and \disdca}
 In this section, we show that the algorithm \cocoap
 in \cite{CM15a} under the setting used in their experiments,
@@ -1935,6 +1933,7 @@ Thus, the two algorithms are identical and our experiments already included this
 \end{itemize}
 \caption{The practical variant of the \disdca algorithm.}
 \end{algorithm*}
+\fi
 
 \end{document}
 
diff --git a/doc/distcd.tex b/doc/distcd.tex
index 11e20c3..47ae00d 100644
--- a/doc/distcd.tex
+++ b/doc/distcd.tex
@@ -1275,15 +1275,15 @@ is significantly faster than the state-of-the-art primal solver
 and all existing
 distributed dual linear SVM algorithms.
 
-In experiments of \cite{CM15a},
-\disdca is significantly faster than \dsvm,
-but the difference here is not huge.
-The reason is that we consider larger data
-that are closer to those being used in real
-distributed environments.
-Also we use a larger $C$ to represent difficult problems.
+Note that the comparison between \dsvm and \disdca is consistent
+with the results in \cite{CM15a}, which show that when a smaller $\lambda$
+(equivalent to a larger weight on the loss term)
+is used, the difference between the two algorithms is less significant.
+This can also be verified by the result on the \uu data set, which has a larger $l$ and thus
+a larger loss term under a fixed $C$,
+corresponding to a $\lambda$ smaller than those considered in \cite{CM15a}.
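+To make this correspondence concrete, note that with $l$ training instances
+and loss terms $\xi_i$, the two common scalings of the objective (a standard
+identity, stated here for completeness rather than taken from either paper)
+satisfy
+\begin{equation*}
+	\min_{w}\ \frac{\lambda}{2}\|w\|^2 + \frac{1}{l}\sum_{i=1}^{l} \xi_i
+	\quad\Longleftrightarrow\quad
+	\min_{w}\ \frac{1}{2}w^Tw + C\sum_{i=1}^{l} \xi_i
+	\qquad\text{with}\quad \lambda = \frac{1}{Cl},
+\end{equation*}
+so a larger $C$ or a larger $l$ both correspond to a smaller $\lambda$.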
 Additional experiments in the supplement show that for
-smaller $C$, \disdca is superior.
+smaller $C$, the differences are significant and \disdca is superior.
 But in these cases,
 most algorithms finish training in a very short time
 and thus the setting of smaller $C$ does not provide meaningful
-- 
GitLab