From b49d0793d4ea01a3a18bd596b9cde16c77bb7a97 Mon Sep 17 00:00:00 2001
From: Stephen Mayhew <mayhew2@illinois.edu>
Date: Mon, 4 Jan 2016 12:17:05 -0600
Subject: [PATCH] moved files to a new package, and updated readme

---
 README.md                                     |  7 +-
 pom.xml                                       | 36 +++++++++-
 .../java/edu/illinois/cs/cogcomp/Tester.java  | 27 --------
 .../cs/cogcomp/{ => lm}/NeuralLM.java         | 37 ++++++----
 .../illinois/cs/cogcomp/{ => lm}/NplmJni.java |  2 +-
 .../edu/illinois/cs/cogcomp/lm/Tester.java    | 68 +++++++++++++++++++
 6 files changed, 133 insertions(+), 44 deletions(-)
 delete mode 100644 src/main/java/edu/illinois/cs/cogcomp/Tester.java
 rename src/main/java/edu/illinois/cs/cogcomp/{ => lm}/NeuralLM.java (93%)
 rename src/main/java/edu/illinois/cs/cogcomp/{ => lm}/NplmJni.java (87%)
 create mode 100644 src/main/java/edu/illinois/cs/cogcomp/lm/Tester.java

diff --git a/README.md b/README.md
index 81bfa58..86eafd9 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,13 @@
 Neural Probabilistic Language Model Toolkit -- Java
 ===================================================
 
-This is based on the [NPLM toolkit](http://nlg.isi.edu/software/nplm/) from ISI. In fact, it is a simple port of the nplm.py python file from that distribution. The goal is to have a reader for the language model file which can be used in Java. The interesting part is the matrix multiplication to get the probability of text.
+This is based on the [NPLM toolkit](http://nlg.isi.edu/software/nplm/) from ISI. In fact, it
+is a simple port of the nplm.py python file from that distribution. The goal is to have a
+reader for the language model file which can be used in Java. The interesting part is the
+matrix multiplication to get the probability of text.
 
 Hopefully there will be a JNI section for training a LM from Java also.
 
+To train a model, use the tools that ship with the NPLM toolkit itself (they are not part of this Java port).
+These are executables called `prepareNeuralNetwork`, `trainNeuralNetwork`, `testNeuralNetwork`, `testNeuralLM`, etc.
 
diff --git a/pom.xml b/pom.xml
index 925ce24..1e0ab8d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
 
     <groupId>edu.illinois.cs.cogcomp</groupId>
     <artifactId>nplm-java</artifactId>
-    <version>1.0-SNAPSHOT</version>
+    <version>1.0.0</version>
 
     <repositories>
         <repository>
@@ -63,4 +63,36 @@
 
     </dependencies>
 
-</project>
\ No newline at end of file
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>2.0.2</version>
+                <configuration>
+                    <source>1.7</source>
+                    <target>1.7</target>
+                </configuration>
+            </plugin>
+        </plugins>
+
+        <extensions>
+          <extension>
+            <groupId>org.apache.maven.wagon</groupId>
+            <artifactId>wagon-ssh</artifactId>
+            <version>2.4</version>
+          </extension>
+        </extensions>
+        
+    </build>
+
+    <distributionManagement>
+      <repository>
+        <id>CogcompSoftware</id>
+        <name>CogcompSoftware</name>
+        <url>scp://bilbo.cs.illinois.edu:/mounts/bilbo/disks/0/www/cogcomp/html/m2repo</url>
+      </repository>
+    </distributionManagement>
+    <packaging>jar</packaging>
+
+</project>
diff --git a/src/main/java/edu/illinois/cs/cogcomp/Tester.java b/src/main/java/edu/illinois/cs/cogcomp/Tester.java
deleted file mode 100644
index 7146d2e..0000000
--- a/src/main/java/edu/illinois/cs/cogcomp/Tester.java
+++ /dev/null
@@ -1,27 +0,0 @@
-package edu.illinois.cs.cogcomp;
-
-import org.apache.commons.math3.linear.*;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Created by mayhew2 on 11/19/15.
- */
-public class Tester {
-
-    public static void main(String[] args) throws Exception {
-
-        NeuralLM m = NeuralLM.from_file("/shared/experiments/mayhew2/transliteration/NEURAL_LANGUAGE_MODEL/example/inferno.nnlm");
-
-        List<String> ngrams = new ArrayList<>();
-
-        ngrams.add("fair");
-        ngrams.add("and");
-        ngrams.add("xkcd");
-
-        System.out.println(m.ngram_prob(ngrams));
-
-    }
-
-}
diff --git a/src/main/java/edu/illinois/cs/cogcomp/NeuralLM.java b/src/main/java/edu/illinois/cs/cogcomp/lm/NeuralLM.java
similarity index 93%
rename from src/main/java/edu/illinois/cs/cogcomp/NeuralLM.java
rename to src/main/java/edu/illinois/cs/cogcomp/lm/NeuralLM.java
index dcc001d..16bf989 100644
--- a/src/main/java/edu/illinois/cs/cogcomp/NeuralLM.java
+++ b/src/main/java/edu/illinois/cs/cogcomp/lm/NeuralLM.java
@@ -1,4 +1,4 @@
-package edu.illinois.cs.cogcomp;
+package edu.illinois.cs.cogcomp.lm;
 
 import edu.illinois.cs.cogcomp.core.io.LineIO;
 import org.apache.commons.math3.linear.*;
@@ -8,7 +8,6 @@ import org.apache.commons.math3.random.UniformRandomGenerator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.FileNotFoundException;
 import java.util.*;
 
 /**
@@ -113,22 +112,34 @@ public class NeuralLM {
         uniform(this.output_biases, r);
     }
 
+
     /**
-     * inputs is a list of one-hot vectors, size of each should be input_embedding_dimension.
-     * length of inputs is ngram-1. The first element of inputs is a matrix containing one-hot
-     * vectors of the first word in each ngram. The second element is a matrix containing one-hot vectors
-     * of the second word in each ngram. etc.
-     *
-     * output is a matrix containing one-hot vectors, where each vector is the last in each ngram.
-     * output is also a one-hot vector, size of output_embedding_dimension.
-     *
-     * @return
+     * Convenience overload, primarily intended for character-based n-grams.
+     * @param chars needs to have length of ngram_size.
+     * @return log probability of ngram.
+     * @throws Exception if chars does not have length ngram_size.
+     */
+    public double ngram_prob(char[] chars) throws Exception {
+        List<String> tokens = new ArrayList<>(chars.length);
+        for (int i = 0; i < chars.length; i++) {
+            tokens.add(String.valueOf(chars[i])); // one single-character token per char
+        }
+        return ngram_prob(tokens);
+    }
+
+    /**
+     * Get the log probability of a single ngram.
+     * @param ngramlist this is a list of items in the ngram. This needs to be of
+     *                  size ngram_size. For example, {'john', 'ate', 'cabbage'}
+     *                  for ngram_size=3
+     * @return the log probability of this ngram
+     * @throws Exception if ngramlist.size() != ngram_size (FIXME from the author: this exception handling needs to be fixed).
      */
     public double ngram_prob(List<String> ngramlist) throws Exception {
 
         // the input here is a single ngram. Length of ngram
         if(ngramlist.size() != ngram_size){
-            throw new Exception("ngmram-list must have the same size as ngram_size");
+            throw new Exception("ngram-list must have the same size as ngram_size");
         }
 
         int unkInd = this.word_to_index.get("<unk>");
@@ -153,6 +164,7 @@ public class NeuralLM {
             }
             RealVector r = new ArrayRealVector(this.n_input_vocab);
             r.setEntry(ind, 1);
+            // FIXME: this could be changed to an index lookup, not a multiplication
             RealVector r_embed = this.input_embeddings.transpose().operate(r);
             concatenated = concatenated.append(r_embed);
         }
@@ -347,7 +359,6 @@ public class NeuralLM {
             }else if(section.equals("\\output_vocab")){
                 // FIXME: not sure there is anything I need to do???
             } else if(section.equals("\\input_embeddings")) {
-                logger.debug("got here");
                 m.input_embeddings = read_matrix(sectionlines, m.n_input_vocab, m.input_embedding_dimension);
             }else if(section.equals("\\hidden_weights 1")) {
                 m.hidden1_weights = read_matrix(sectionlines, m.n_hidden, (m.ngram_size - 1) * m.input_embedding_dimension);
diff --git a/src/main/java/edu/illinois/cs/cogcomp/NplmJni.java b/src/main/java/edu/illinois/cs/cogcomp/lm/NplmJni.java
similarity index 87%
rename from src/main/java/edu/illinois/cs/cogcomp/NplmJni.java
rename to src/main/java/edu/illinois/cs/cogcomp/lm/NplmJni.java
index ef4d07c..0e8654b 100644
--- a/src/main/java/edu/illinois/cs/cogcomp/NplmJni.java
+++ b/src/main/java/edu/illinois/cs/cogcomp/lm/NplmJni.java
@@ -1,4 +1,4 @@
-package edu.illinois.cs.cogcomp;
+package edu.illinois.cs.cogcomp.lm;
 
 /**
  * Created by mayhew2 on 11/18/15.
diff --git a/src/main/java/edu/illinois/cs/cogcomp/lm/Tester.java b/src/main/java/edu/illinois/cs/cogcomp/lm/Tester.java
new file mode 100644
index 0000000..830671d
--- /dev/null
+++ b/src/main/java/edu/illinois/cs/cogcomp/lm/Tester.java
@@ -0,0 +1,68 @@
+package edu.illinois.cs.cogcomp.lm;
+
+import edu.illinois.cs.cogcomp.core.io.LineIO;
+import edu.illinois.cs.cogcomp.lm.NeuralLM;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Small command-line smoke test for {@link NeuralLM}.
+ *
+ * Loads a language model, sums the ngram_prob value of every adjacent token pair
+ * on each line of a word file (each line is first prefixed with {@code <s>}),
+ * prints that total, and then prints the ngram_prob of (character, {@code </s>})
+ * for each character of a sample Russian word. Both loops build 2-item ngrams,
+ * and ngram_prob throws when the list size differs from the model's ngram_size.
+ *
+ * Created by mayhew2 on 11/19/15.
+ */
+public class Tester {
+
+    // Defaults keep the original hard-coded locations; override them on the command line.
+    private static final String DEFAULT_MODEL = "/home/mayhew2/IdeaProjects/illinois-transliteration/lm/nplm-ru.txt";
+    private static final String DEFAULT_WORDS = "/home/mayhew2/IdeaProjects/illinois-transliteration/lm/ruwords.txt";
+
+    /**
+     * @param args optional: args[0] = model file, args[1] = word file (one
+     *             space-separated token sequence per line)
+     * @throws Exception if loading the model, reading the word file, or scoring an ngram fails
+     */
+    public static void main(String[] args) throws Exception {
+        String modelPath = args.length > 0 ? args[0] : DEFAULT_MODEL;
+        String wordsPath = args.length > 1 ? args[1] : DEFAULT_WORDS;
+
+        NeuralLM m = NeuralLM.from_file(modelPath);
+        List<String> lines = LineIO.read(wordsPath);
+
+        double total = 0;
+        List<String> ngrams = new ArrayList<>();
+
+        for (String line : lines) {
+            // prepend "<s>" so the first real token of the line is scored as well
+            String[] sline = ("<s> " + line).split(" ");
+
+            for (int i = 0; i < sline.length - 1; i++) {
+                ngrams.clear();
+                ngrams.add(sline[i]);
+                ngrams.add(sline[i + 1]);
+
+                total += m.ngram_prob(ngrams);
+            }
+        }
+        System.out.println(total);
+        System.out.println();
+
+        // "хрусталь", written as Unicode escapes so the literal does not depend
+        // on the encoding used to read or transmit this source file.
+        String russian = "\u0445\u0440\u0443\u0441\u0442\u0430\u043b\u044c";
+
+        for (char c : russian.toCharArray()) {
+            ngrams.clear();
+            ngrams.add(String.valueOf(c));
+            ngrams.add("</s>");
+
+            System.out.println(m.ngram_prob(ngrams));
+        }
+    }
+}
-- 
GitLab