From a052269f39d29c5cb7728e6b59a7293fa6da33eb Mon Sep 17 00:00:00 2001 From: leepei <leepei> Date: Mon, 13 Jul 2015 03:00:33 -0700 Subject: [PATCH] update exp descriptions --- doc/Makefile | 2 +- doc/disdca-cocoap.tex | 377 ----------------------------------------- doc/distcd-cogcomp.tex | 33 ++-- doc/distcd.tex | 16 +- 4 files changed, 25 insertions(+), 403 deletions(-) delete mode 100644 doc/disdca-cocoap.tex diff --git a/doc/Makefile b/doc/Makefile index ccdadba..6326c44 100755 --- a/doc/Makefile +++ b/doc/Makefile @@ -1,4 +1,4 @@ -FILES = distcd.pdf supple.pdf distcd-cogcomp.pdf disdca-cocoap.pdf +FILES = distcd.pdf supple.pdf distcd-cogcomp.pdf TEX_DEP = *.tex *.bib .SUFFIXES: .tex .in .dvi .ps .pdf .PHONY: all diff --git a/doc/disdca-cocoap.tex b/doc/disdca-cocoap.tex deleted file mode 100644 index e18429a..0000000 --- a/doc/disdca-cocoap.tex +++ /dev/null @@ -1,377 +0,0 @@ -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%% ICML 2015 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\documentclass{article} -\pdfoutput=1 - -\usepackage[pointedenum]{paralist} -\usepackage{enumitem,listings,color,verbatim} -\usepackage{times} -\usepackage{graphicx} -\usepackage{subfigure} -\usepackage{algorithm} -\usepackage{xspace,amsmath} -\usepackage{xr} -\usepackage{hyperref} - -\DeclareMathOperator*{\argmax}{arg\,max} -\newcommand{\bqoe}{$\mbox{{\sf BQO-E}}$\xspace} -\def\R{{ \mathbf{R}}} -\def\N{{ \mathbf{N}}} -\def\P{{ \mathcal{P}}} -\definecolor{dkgreen}{rgb}{0,0.6,0} -\definecolor{listinggray}{gray}{0.9} -\definecolor{lbcolor}{rgb}{0.9,0.9,0.9} -\def\bx{{\boldsymbol x}} -\def\bu{{\boldsymbol u}} -\def\bw{{\boldsymbol w}} -\def\bv{{\boldsymbol v}} -\def\be{{\boldsymbol e}} -\def\bd{{\boldsymbol d}} -\def\bzero{{\boldsymbol 0}} -\def\AL{{\boldsymbol{\alpha}}} -\def\webspam{{\sf webspam}\xspace} -\def\eps{{\sf epsilon}\xspace} -\def\uu{{\sf url}\xspace} -\def\tron{{\sf TRON}\xspace} -\newcommand{\disdca}{$\mbox{{\sf DisDCA}}$\xspace} -\newcommand{\cocoa}{$\mbox{{\sf CoCoA}}$\xspace} -\newcommand{\mpi}{$\mbox{{\sf MPI-LIBLINEAR}}$\xspace} -\newcommand{\dsvm}{$\mbox{{\sf DSVM-AVE}}$\xspace} -\newcommand{\svmls}{$\mbox{{\sf DSVM-LS}}$\xspace} -\newcommand{\BlackBox}{\rule{1.5ex}{1.5ex}} % end of proof -\newenvironment{proof}{\par\noindent{\bf Proof\ }}{\hfill\BlackBox\\[2mm]} -\newtheorem{theorem}{Theorem}[section] -\newtheorem{lemma}[theorem]{Lemma} -\newtheorem{corollary}[theorem]{Corollary} -\newtheorem{innercustomthm}{Theorem} -\newenvironment{customthm}[1] -{\renewcommand\theinnercustomthm{#1}\innercustomthm} -{\endinnercustomthm} -% Employ this version of the ``usepackage'' statement after the paper has -% been accepted, when creating the final version. 
This will set the
-% note in the first column to ``Proceedings of the...''
-\usepackage[accepted]{icml2015}
-
-\newcommand{\cocoap}{$\mbox{{\sf CoCoA+}}$\xspace}
-\newcommand{\birds}{$\mbox{{\sf Birds}}$\xspace}
-
-\icmltitlerunning{On the Equivalence of \cocoap and \disdca}
-
-\begin{document}
-\lstdefinelanguage{scala}{
-  morekeywords={%
-  abstract,case,catch,class,def,do,else,extends,%
-  false,final,finally,for,forSome,if,implicit,import,lazy,%
-  match,new,null,object,override,package,private,protected,%
-  return,sealed,super,this,throw,trait,true,try,type,%
-  val,var,while,with,yield},
-  otherkeywords={=>,<-,<\%,<:,>:,\#,@},
-  sensitive=true,
-  morecomment=[l]{//},
-  morecomment=[n]{/*}{*/},
-  morestring=[b]",
-  morestring=[b]',
-  morestring=[b]"""
-  }[keywords,comments,strings]
-
-  % activate the language and predefine settings
-% \lstset{language=Scala}
-
-
-\twocolumn[
-\icmltitle{On the Equivalence of \cocoap and \disdca}
-\icmlauthor{Ching-pei Lee}{clee149@illinois.edu}
-\icmladdress{University of Illinois at Urbana-Champaign,
-201 N. Goodwin Avenue, Urbana, IL 61801 USA}
-]
-
-In this document, we show that the algorithm \cocoap
-\citep{CM15a}, under the setting used in their experiments,
-which is also the best setting suggested by the
-authors who proposed this algorithm,
-is equivalent to the practical variant of \disdca
-\citep{TBY13a}.
-
-\section{Notation}
-To compare the two methods, we first unify the
-notation of the two papers.
-Given training instances $\{(x_i,y_i)\}_{i=1}^n$,
-the problems being solved are Problem (2) in \cite{CM15a}:
-\begin{equation}
-  \max_{\alpha \in \R^n} -\frac{1}{n}\sum_{j=1}^n
-  l^*_{j}\left(-\alpha_j\right) - \frac{\lambda}{2}
-  \left\|\frac{1}{\lambda n} A\alpha\right\|^2,
-  \label{eq:dual}
-\end{equation}
-where $A = [x_1,x_2,\ldots,x_n]$,
-and Problem (2) in \cite{TBY13a}:
-\begin{equation*}
-  \max_{\alpha \in \R^n} \frac{1}{n}\sum_{i=1}^n
-  -\phi^*_{i}\left(-\alpha_i\right) - \lambda g^*\left(
-  \frac{1}{\lambda n} \sum_{i=1}^n \alpha_i x_i\right).
-\end{equation*}
-By considering the special case $g^*(\cdot) = \frac{1}{2}\|
-\cdot \|^2$ and noting that $\phi^*_i$ and $l^*_i$ are just
-different notations for the same function,
-we see that the two problems are equivalent.
-In the following analysis, we use the formulation of \eqref{eq:dual}.
-
-\section{Algorithms}
-\label{sec:alg}
-Assume there are $K$ machines disjointly storing the
-training instances.
-The index set of the instances on machine $k$ is denoted by
-$P_k$, and for any vector $\bv \in \R^n$, we define the vector
-$\bv_{[k]} \in \R^n$ by
-\begin{equation*}
-  \left(\bv_{[k]}\right)_i = \begin{cases}
-    0, &\text{ if } i \notin P_k,\\
-    \bv_i, &\text{ if } i \in P_k.
-  \end{cases}
-\end{equation*}
-We list the \cocoap algorithm under the best
-setting ($\sigma' = K, \gamma = 1$) suggested by the
-authors of \cite{CM15a} in Algorithm \ref{alg:cocoap},
-and the practical variant of \disdca
-in Algorithm \ref{alg:disdca}.
-Note that for \disdca with $g(\bw) = \frac{1}{2}\bw^T \bw$,
-we have $\nabla g(\bw) = \bw$.
-
-Now let
-\begin{equation*}
-  \Delta \alpha = \delta,\quad \bu =
-  \bw^t + \frac{K}{\lambda n}A \Delta \alpha_{[k]},
-\end{equation*}
-and observe from the update rules in line 1.2.3 of both algorithms
-that $\AL$ in Algorithm \ref{alg:disdca} corresponds to
-$\AL + \Delta\AL_{[k]}$ in Algorithm \ref{alg:cocoap};
-it follows that \eqref{eq:cocoap} and \eqref{eq:disdca} are equivalent problems.
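-To spell out the substitution: plugging $\bu = \bw^t +
-\frac{K}{\lambda n}A \Delta \alpha_{[k]}$ and $\Delta\alpha = \delta$
-into the last line of \eqref{eq:cocoap} turns the sub-problem objective into
-\begin{equation*}
-  - l_i^*\left(-\left(\alpha_i + \Delta \alpha_i\right) - \Delta\alpha\right)
-  - \bu^T x_i \Delta \alpha
-  - \frac{K}{2 \lambda n}\|x_i\|^2 \left(\Delta \alpha\right)^2,
-\end{equation*}
-which is exactly \eqref{eq:disdca} once $\alpha_i + \Delta\alpha_i$ is
-renamed to $\alpha_i$, in accordance with the correspondence between the
-dual iterates noted above.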
-Note that line 1.2.4 in Algorithm \ref{alg:disdca} indeed
-ensures $\bu = \bw^t + \frac{K}{\lambda n}A \Delta \alpha_{[k]}$.
-Thus, the two algorithms are identical.
-\begin{algorithm*}
-  \label{alg:cocoap}
-  \begin{itemize}
-  \item Input: number of iterations $T$, number of inner iterations $H$ for the local SDCA solver.
-  \item Let $\AL = \bzero$, $\bw^0 = \bzero$.
-  \item For $t=1,2,\ldots,T$:
-    \begin{enumerate}
-    \item Run the following process on the $K$ machines in parallel:
-      \begin{enumerate}
-      \item Let $\Delta \AL = \bzero$.
-      \item For $h=1,\ldots,H:$
-        \begin{enumerate}
-        \item Pick $i \in P_k$ uniformly at random.
-        \item Solve
-        \end{enumerate}
-        \begin{align}
-          \delta_i^*
-          &= \argmax_{\delta \in \R}\quad
-          - l_i^*\left(-\left(\alpha_i + \Delta \alpha_i\right) - \delta\right)
-          -\left(\bw^{t}\right)^T A \left(\Delta \alpha_{[k]} + \delta e_i\right)
-          - \frac{\lambda nK}{2}\left\| \frac{1}{\lambda n} A\left(\Delta \alpha_{[k]} + \delta e_i\right)\right\|^2\nonumber\\
-          &= \argmax_{\delta \in \R}\quad
-          - l_i^*\left(-\left(\alpha_i + \Delta \alpha_i \right) - \delta\right)
-          -\left(\bw^{t} + \frac{K}{\lambda n}A\Delta \alpha_{[k]}\right)^T x_i \delta
-          - \frac{K}{2 \lambda n}\|x_i\|^2 \delta^2
-          \label{eq:cocoap}
-        \end{align}
-        \begin{enumerate}
-        \item[1.2.3.] $\Delta \alpha_{[k]} \leftarrow \Delta \alpha_{[k]} + \delta_i^* e_i$.
-        \end{enumerate}
-      \item Update $\alpha_{[k]} = \alpha_{[k]} + \Delta \alpha_{[k]}$.
-      \end{enumerate}
-    \item Obtain $\bw^{t+1} = \bw^t + \sum_{k=1}^K \frac{1}{\lambda n} A \Delta \AL_{[k]}$ on all machines.
-    \end{enumerate}
-  \end{itemize}
-  \caption{The \cocoap algorithm, under the setting suggested by the authors of \cite{CM15a}, which is also the setting used in their experiments.}
-\end{algorithm*}
-
-\begin{algorithm*}
-  \label{alg:disdca}
-  \begin{itemize}
-  \item Input: number of iterations $T$, number of inner iterations $H$ for the local SDCA solver.
-  \item Let $\AL = \bzero$, $\bw^0 = \bzero$.
-  \item For $t=1,2,\ldots,T$:
-    \begin{enumerate}
-    \item Run the following process on the $K$ machines in parallel:
-      \begin{enumerate}
-      \item Let $\bu = \bw^t$.
-      \item For $h=1,\ldots,H:$
-        \begin{enumerate}
-        \item Pick $i \in P_k$ uniformly at random.
-        \item Solve
-        \end{enumerate}
-        \begin{equation}
-          \Delta \AL_i
-          = \argmax_{\Delta \alpha \in \R}\quad
-          - l_i^*\left(-\alpha_i - \Delta \alpha\right)
-          -\bu^T x_i \Delta \alpha
-          - \frac{K}{2 \lambda n}\|x_i\|^2 \left(\Delta \alpha\right)^2
-          \label{eq:disdca}
-        \end{equation}
-        \begin{enumerate}
-        \item[1.2.3.] $\alpha_i = \alpha_i + \Delta \alpha$.
-        \item[1.2.4.] $\bu = \bu + \frac{K}{\lambda n} \Delta \alpha\, x_i$.
-        \end{enumerate}
-      \end{enumerate}
-    \item Obtain $\bw^{t+1} = \bw^t + \sum_{k=1}^K \frac{1}{\lambda n} A \Delta \AL_{[k]}$ on all machines.
-    \end{enumerate}
-  \end{itemize}
-\caption{The practical variant of the \disdca algorithm.}
-\end{algorithm*}
-
-\section{Implementation Comparison}
-Here we compare the two implementations, \cocoap and \birds.
-\cocoap is the code released by the authors of \cite{CM15a}, implementing their algorithm in Apache Spark.
-As indicated in \cite{CM15a}, it is available at \url{http://github.com/gingsmith/cocoa/}.
-\birds is the code released by the author of \cite{TBY13a}, implementing the practical variant of \disdca proposed in that work, in C++ with MPI.
-It is available at \url{http://homepage.cs.uiowa.edu/~tyng/software.html}.
-
-We excerpt the core parts of both codes that solve the local sub-problems to verify our argument in Section \ref{sec:alg}.
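-Both excerpts implement the hinge (L1) loss, for which the local
-sub-problem has a closed-form solution; since neither paper spells
-this step out, we sketch it here.
-For $l_i(z) = \max(0, 1 - y_i z)$ with $y_i \in \{-1, 1\}$, the
-conjugate is $l_i^*(-a) = -a y_i$ for $a y_i \in [0,1]$ and $\infty$
-otherwise, so \eqref{eq:disdca} is a one-variable concave quadratic
-over an interval, maximized by
-\begin{equation*}
-  y_i\left(\alpha_i + \Delta\alpha\right)
-  = \min\left(\max\left(y_i \alpha_i
-  + \frac{1 - y_i \bu^T x_i}{\frac{K}{\lambda n} \|x_i\|^2},
-  0\right), 1\right).
-\end{equation*}
-This clipped update is what both code excerpts compute, as the
-line-by-line comparison below confirms.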
-Figure \ref{fig:cocoa} shows lines 171-201 of the file CoCoA.scala in \cocoap.
-Figure \ref{fig:disdca} shows lines 81-90 of the file inc\_dual.cc in \birds.
-
-Note that in Figure \ref{fig:cocoa}, the variable {\em plus} in \cocoap is set to true, and {\em sigma} is $K$, as suggested in the paper.
-In Figure \ref{fig:disdca}, the variable {\em coeff} is $K/(\lambda n)$,
-and the variable {\em mQ} is the value of $\bx_i^T \bx_i$.
-
-In the beginning, {\em deltaW} in line 174 of CoCoA.scala is $0$, so the value of
-{\em grad} after line 174
-and the
-{\em 1 - prediction} part in line 84 of inc\_dual.cc have the following relationship:
-\begin{equation}
-  \text{grad} = -\lambda n \left(1 - \text{prediction}\right).
-  \label{eq:cocoa-disdca}
-\end{equation}
-Lines 181-187 of CoCoA.scala and line 85 of inc\_dual.cc both project the variable back to the feasible region, though the details differ.
-
-From line 188 of CoCoA.scala, we see that {\em qii} is $K$ times {\em Data.mQ(j)} in line 84 of inc\_dual.cc.
-Combining these factors,
-we have that
-$-(\text{grad} / \text{qii})$
-in line 191 of CoCoA.scala is the same as the first term of line 84 in inc\_dual.cc:
-\begin{align*}
-  -\text{grad}/\text{qii} &= -\frac{\left(y_i \bw^T \bx_i - 1\right) \lambda n}{ \bx_i^T \bx_i K}
-  = \frac{1 - y_i \bw^T \bx_i}{\bx_i^T \bx_i \cdot \frac{K}{\lambda n}}\\
-  &= \left(1 - \text{prediction}\right) / \left(\text{Data.mQ(j) * coeff}\right).
-\end{align*}
-Then line 199 in Figure \ref{fig:cocoa} and lines 87-88 in Figure \ref{fig:disdca} update the primal variables.
-Here a difference occurs:
-the update of WA in inc\_dual.cc is $K$ times larger than the update of deltaW in CoCoA.scala.
-Thus, in the next round of inc\_dual.cc,
-if we denote by $\text{WA}_0$ the value of WA in the previous round,
-so that $K \cdot \text{deltaW}$ is the vector being added to $\text{WA}_0$,
-we have that
-\begin{equation*}
-  \text{Data.Xw}\left(\text{WA},j\right) = \left(\text{WA}_0 + K \cdot \text{deltaW}\right)^T \bx_j,
-\end{equation*}
-which is exactly the same computation as line 174 of CoCoA.scala.
-Thus \eqref{eq:cocoa-disdca} still holds.
-Therefore, the algorithms behind the two implementations are identical.
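-The correspondence can also be checked numerically.
-The following is a minimal sketch of ours (it is not taken from
-either release; the variable names merely mirror the excerpts, and
-the data are synthetic): it replays both update rules on the same
-random coordinate sequence and asserts the invariant
-$\bu = \bw^t + K \cdot \text{deltaW}$ after every step.
-\begin{lstlisting}[language=Python]
-import numpy as np
-
-rng = np.random.default_rng(0)
-n, d, K, lam, H = 20, 5, 4, 0.01, 50
-X = rng.standard_normal((n, d))
-y = rng.choice([-1.0, 1.0], size=n)
-
-# CoCoA+ local solver (plus=true, sigma=K):
-# w is frozen; updates accumulate in deltaW.
-alpha_c, w, deltaW = np.zeros(n), np.zeros(d), np.zeros(d)
-# DisDCA practical variant: u is updated in place.
-alpha_d, u = np.zeros(n), np.zeros(d)
-
-coeff = K / (lam * n)
-for i in rng.integers(0, n, size=H):
-    x, yi = X[i], y[i]
-    # CoCoA+ step (cf. lines 171-201 of CoCoA.scala)
-    grad = (yi * (x @ w + K * (x @ deltaW)) - 1.0) * (lam * n)
-    qii = K * (x @ x)
-    new_alpha = np.clip(alpha_c[i] - grad / qii, 0.0, 1.0)
-    deltaW += x * (yi * (new_alpha - alpha_c[i]) / (lam * n))
-    alpha_c[i] = new_alpha
-    # DisDCA step (cf. lines 81-90 of inc_dual.cc)
-    prediction = yi * (x @ u)
-    v = np.clip((1.0 - prediction) / ((x @ x) * coeff)
-                + alpha_d[i], 0.0, 1.0)
-    u += x * ((v - alpha_d[i]) * yi * coeff)
-    alpha_d[i] = v
-    # the invariant behind the equivalence argument
-    assert np.allclose(u, w + K * deltaW)
-    assert np.allclose(alpha_c, alpha_d)
-print("iterates coincide")
-\end{lstlisting}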
-
-
-\begin{figure*}
-  \lstset{
-    backgroundcolor=\color{lbcolor},
-    tabsize=4,
-    language=scala,
-    captionpos=b,
-    tabsize=3,
-    frame=lines,
-    numbers=left,
-    numberstyle=\tiny,
-    numbersep=5pt,
-    firstnumber=171,
-    breaklines=true,
-    showstringspaces=false,
-    basicstyle=\footnotesize,
-    % identifierstyle=\color{magenta},
-    keywordstyle=\color[rgb]{0,0,1},
-    commentstyle=\color{dkgreen},
-    stringstyle=\color{red}
-  }
-  \begin{lstlisting}
-    // compute hinge loss gradient
-    val grad = {
-      if (plus) {
-        (y*(x.dot(w)+sigma*x.dot(deltaW)) - 1.0)*(lambda*n)
-      } else {
-        (y*(x.dot(w)) - 1.0)*(lambda*n)
-      }
-    }
-
-    // compute projected gradient
-    var proj_grad = grad
-    if (alpha(idx) <= 0.0)
-      proj_grad = Math.min(grad,0)
-    else if (alpha(idx) >= 1.0)
-      proj_grad = Math.max(grad,0)
-
-    if (Math.abs(proj_grad) != 0.0 ) {
-      val qii = if (plus) x.dot(x)*sigma else x.dot(x)
-      var newAlpha = 1.0
-      if (qii != 0.0) {
-        newAlpha = Math.min(Math.max((alpha(idx) - (grad / qii)), 0.0), 1.0)
-      }
-
-      // update primal and dual variables
-      val update = x.times( y*(newAlpha-alpha(idx))/(lambda*n) )
-      if (!plus) {
-        w = update.plus(w)
-      }
-      deltaW = update.plus(deltaW)
-      alpha(idx) = newAlpha
-    }
-  \end{lstlisting}
-
-  \caption{Lines 171-201 of CoCoA.scala in \cocoap.}
-  \label{fig:cocoa}
-\end{figure*}
-\begin{figure*}
-  \lstset{
-    backgroundcolor=\color{lbcolor},
-    tabsize=4,
-    language=C++,
-    captionpos=b,
-    tabsize=3,
-    frame=lines,
-    numbers=left,
-    numberstyle=\tiny,
-    numbersep=5pt,
-    firstnumber=81,
-    breaklines=true,
-    showstringspaces=false,
-    basicstyle=\footnotesize,
-    % identifierstyle=\color{magenta},
-    keywordstyle=\color[rgb]{0,0,1},
-    commentstyle=\color{dkgreen},
-    stringstyle=\color{red}
-  }
-  \begin{lstlisting}
-    int is_class = (Data.y<int>(j)==myclass)?1:-1;
-
-    double prediction = is_class*(Data.Xw(WA,j));
-    double v = (1 - prediction)/(Data.mQ(j)*coeff) + Alpha[j];
-    v=std::max(std::min(v,1.0), 0.0);
-    double del_alpha=v- Alpha[j];
-    if(Data.fmt()=="dense") WA.add(Data.dX(j), del_alpha*is_class*coeff);
-    else WA.add(Data.sX(j), del_alpha*is_class*coeff);
-    phi[0] += del_alpha;
-    Alpha[j]=v;
-  \end{lstlisting}
-
-  \caption{Lines 81-90 of inc\_dual.cc in \birds.}
-  \label{fig:disdca}
-\end{figure*}
-
-
-\bibliography{tmp}
-\bibliographystyle{icml2015}
-
-\end{document}
-
-
diff --git a/doc/distcd-cogcomp.tex b/doc/distcd-cogcomp.tex
index e8a4230..f4a8258 100644
--- a/doc/distcd-cogcomp.tex
+++ b/doc/distcd-cogcomp.tex
@@ -3,21 +3,17 @@
 \documentclass{article}
 
-\usepackage{amsmath,enumerate,xspace,bbold,amsthm,mathtools}
-\usepackage{multirow,array,theorem,amsmath,latexsym,xspace,bibentry,tikz}
-\usepackage{array,theorem,amsmath,latexsym,hhline,pifont,xspace, bibentry}
-\usepackage[round,authoryear]{natbib}
-\usepackage[TABBOTCAP]{subfigure}
-\usepackage{grffile}
-\usepackage{breakurl}
-\usepackage{epstopdf}
 \usepackage[pointedenum]{paralist}
 \usepackage{enumitem}
 \usepackage{times}
 \usepackage{graphicx}
-\usepackage{algorithm}
+\usepackage{subfigure}
+\usepackage{algorithm,multirow}
+\usepackage{xspace,amsmath}
+\usepackage{xr}
 \usepackage{hyperref}
+\usepackage{xspace,amsmath,multirow}
 
 \DeclareMathOperator*{\argmin}{arg\,min}
 \DeclareMathOperator*{\argmax}{arg\,max}
@@ -1287,15 +1283,15 @@
 is significantly faster than
 the state-of-the-art primal solver
 and all existing distributed dual linear SVM algorithms.
-In experiments of \cite{CM15a},
-\disdca is significantly faster than \dsvm,
-but the difference here is not huge.
-The reason is that we consider larger data
-that are closer to those being used in real
-distributed environments.
-Also we use a larger $C$ to represent difficult problems.
+Note that the comparison between \dsvm and \disdca accords with
+the results in \cite{CM15a}: when a smaller $\lambda$
+(equivalent to a larger weight on the loss term)
+is used, the difference between the two algorithms is less significant.
+This can also be verified by the result on the \uu data set, which has a larger $l$ and thus
+a larger loss term under a fixed $C$;
+since $\lambda$ corresponds to $1/(Cl)$, this amounts to a $\lambda$ smaller than those considered in \cite{CM15a}.
 Additional experiments in the supplement show that for
-smaller $C$, \disdca is superior.
+smaller $C$, the differences are significant and \disdca is superior.
 But in these cases, most algorithms
 finish training in a very short time and thus the setting of
 smaller $C$ does not provide meaningful
@@ -1352,6 +1348,7 @@ Based on this work, we have extended the package \mpi
 \url{http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/distributed-liblinear/}
 to include the proposed implementation.
+\pagebreak
 \section*{Acknowledgment}
 This material is based on research sponsored by DARPA under agreement number FA8750-13-2-0008. The U.S. Government is
@@ -1784,6 +1781,7 @@ We present results of $C=0.0001$.}
 \label{fig:app-time}
 \end{figure*}
+
 \iffalse
 \section{The Equivalence of \cocoap and \disdca}
 In this section, we show that the algorithm \cocoap in \cite{CM15a} under the setting used in their experiments,
@@ -1935,6 +1933,7 @@ Thus, the two algorithms are identical and our experiments already included this
 \end{itemize}
 \caption{The practical variant of the \disdca algorithm.}
 \end{algorithm*}
+\fi
 \end{document}
diff --git a/doc/distcd.tex b/doc/distcd.tex
index 11e20c3..47ae00d 100644
--- a/doc/distcd.tex
+++ b/doc/distcd.tex
@@ -1275,15 +1275,15 @@
 is significantly faster than
 the state-of-the-art primal solver
 and all existing distributed dual linear SVM algorithms.
-In experiments of \cite{CM15a},
-\disdca is significantly faster than \dsvm,
-but the difference here is not huge.
-The reason is that we consider larger data
-that are closer to those being used in real
-distributed environments.
-Also we use a larger $C$ to represent difficult problems.
+Note that the comparison between \dsvm and \disdca accords with
+the results in \cite{CM15a}: when a smaller $\lambda$
+(equivalent to a larger weight on the loss term)
+is used, the difference between the two algorithms is less significant.
+This can also be verified by the result on the \uu data set, which has a larger $l$ and thus
+a larger loss term under a fixed $C$;
+since $\lambda$ corresponds to $1/(Cl)$, this amounts to a $\lambda$ smaller than those considered in \cite{CM15a}.
 Additional experiments in the supplement show that for
-smaller $C$, \disdca is superior.
+smaller $C$, the differences are significant and \disdca is superior.
 But in these cases, most algorithms
 finish training in a very short time and thus the setting of
 smaller $C$ does not provide meaningful
-- 
GitLab