moved files to a new package, and updated readme

b49d0793 · Stephen Mayhew · 7e44da72 · b49d0793 · b49d0793 · 7e44da72
Commit b49d0793 authored 9 years ago by Stephen Mayhew
--- a/README.md
+++ b/README.md
 Neural Probabilistic Language Model Toolkit -- Java
 ===================================================

-This is based on the [NPLM toolkit](http://nlg.isi.edu/software/nplm/) from ISI. In fact, it is a simple port of the nplm.py python file from that distribution. The goal is to have a reader for the language model file which can be used in Java. The interesting part is the matrix multiplication to get the probability of text.
+This is based on the [NPLM toolkit](http://nlg.isi.edu/software/nplm/) from ISI. In fact, it
+is a simple port of the nplm.py python file from that distribution. The goal is to have a
+reader for the language model file which can be used in Java. The interesting part is the
+matrix multiplication to get the probability of text.

 Hopefully there will be a JNI section for training a LM from Java also.

+To train a model, use the tools provided by the NPLM toolkit (these are not written in Java). They are executables
+called `prepareNeuralNetwork`, `trainNeuralNetwork`, `testNeuralNetwork`, `testNeuralLM`, etc.

--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@

    <groupId>edu.illinois.cs.cogcomp</groupId>
    <artifactId>nplm-java</artifactId>
-    <version>1.0-SNAPSHOT</version>
+    <version>1.0.0</version>

    <repositories>
        <repository>
@@ -63,4 +63,36 @@

    </dependencies>

-</project>
\ No newline at end of file
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>2.0.2</version>
+                <configuration>
+                    <source>1.7</source>
+                    <target>1.7</target>
+                </configuration>
+            </plugin>
+        </plugins>
+
+        <extensions>
+          <extension>
+            <groupId>org.apache.maven.wagon</groupId>
+            <artifactId>wagon-ssh</artifactId>
+            <version>2.4</version>
+          </extension>
+        </extensions>
+        
+    </build>
+
+    <distributionManagement>
+      <repository>
+        <id>CogcompSoftware</id>
+        <name>CogcompSoftware</name>
+        <url>scp://bilbo.cs.illinois.edu:/mounts/bilbo/disks/0/www/cogcomp/html/m2repo</url>
+      </repository>
+    </distributionManagement>
+    <packaging>jar</packaging>
+
+</project>
--- a/src/main/java/edu/illinois/cs/cogcomp/Tester.java
+++ b/src/main/java/edu/illinois/cs/cogcomp/Tester.java
-package edu.illinois.cs.cogcomp;
-
-import org.apache.commons.math3.linear.*;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Created by mayhew2 on 11/19/15.
- */
-public class Tester {
-
-    public static void main(String[] args) throws Exception {
-
-        NeuralLM m = NeuralLM.from_file("/shared/experiments/mayhew2/transliteration/NEURAL_LANGUAGE_MODEL/example/inferno.nnlm");
-
-        List<String> ngrams = new ArrayList<>();
-
-        ngrams.add("fair");
-        ngrams.add("and");
-        ngrams.add("xkcd");
-
-        System.out.println(m.ngram_prob(ngrams));
-
-    }
-
-}
--- a/src/main/java/edu/illinois/cs/cogcomp/NeuralLM.java
+++ b/src/main/java/edu/illinois/cs/cogcomp/NeuralLM.java
-package edu.illinois.cs.cogcomp;
+package edu.illinois.cs.cogcomp.lm;

 import edu.illinois.cs.cogcomp.core.io.LineIO;
 import org.apache.commons.math3.linear.*;
@@ -8,7 +8,6 @@ import org.apache.commons.math3.random.UniformRandomGenerator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.io.FileNotFoundException;
 import java.util.*;

 /**
@@ -113,22 +112,34 @@ public class NeuralLM {
        uniform(this.output_biases, r);
    }

+
    /**
-     * inputs is a list of one-hot vectors, size of each should be input_embedding_dimension.
-     * length of inputs is ngram-1. The first element of inputs is a matrix containing one-hot
-     * vectors of the first word in each ngram. The second element is a matrix containing one-hot vectors
-     * of the second word in each ngram. etc.
-     *
-     * output is a matrix containing one-hot vectors, where each vector is the last in each ngram.
-     * output is also a one-hot vector, size of output_embedding_dimension.
-     *
-     * @return
+     * Convenience overload, primarily intended for character-based n-grams.
+     * @param chars needs to have length of ngram_size.
+     * @return log probability of ngram.
+     * @throws Exception propagated from {@link #ngram_prob(List)}, e.g. when chars.length != ngram_size
+     */
+    public double ngram_prob(char[] chars) throws Exception {
+        List<String> lst = new ArrayList<>(); // one single-character String per input char, in order
+        for(char c : chars){
+            lst.add(c + ""); // char -> String via concatenation with the empty string
+        }
+        return ngram_prob(lst); // delegate to the List<String> overload, which checks the size
+    }
+
+    /**
+     * get the probability of a single ngram
+     * @param ngramlist this is a list of items in the ngram. This needs to be of
+     *                  size ngram_size. For example, {'john', 'ate', 'cabbage'}
+     *                  for ngram_size=3
+     * @return the log probability of this ngram
+     * @throws Exception if ngramlist.size() != ngram_size (FIXME: presumably should be a more specific exception type)
     */
    public double ngram_prob(List<String> ngramlist) throws Exception {

        // the input here is a single ngram. Length of ngram
        if(ngramlist.size() != ngram_size){
-            throw new Exception("ngmram-list must have the same size as ngram_size");
+            throw new Exception("ngram-list must have the same size as ngram_size");
        }

        int unkInd = this.word_to_index.get("<unk>");
@@ -153,6 +164,7 @@ public class NeuralLM {
            }
            RealVector r = new ArrayRealVector(this.n_input_vocab);
            r.setEntry(ind, 1);
+            // FIXME: r is one-hot, so this could be an index lookup into input_embeddings instead of a multiplication
            RealVector r_embed = this.input_embeddings.transpose().operate(r);
            concatenated = concatenated.append(r_embed);
        }
@@ -347,7 +359,6 @@ public class NeuralLM {
            }else if(section.equals("\\output_vocab")){
                // FIXME: not sure there is anything I need to do???
            } else if(section.equals("\\input_embeddings")) {
-                logger.debug("got here");
                m.input_embeddings = read_matrix(sectionlines, m.n_input_vocab, m.input_embedding_dimension);
            }else if(section.equals("\\hidden_weights 1")) {
                m.hidden1_weights = read_matrix(sectionlines, m.n_hidden, (m.ngram_size - 1) * m.input_embedding_dimension);

--- a/src/main/java/edu/illinois/cs/cogcomp/NplmJni.java
+++ b/src/main/java/edu/illinois/cs/cogcomp/NplmJni.java
-package edu.illinois.cs.cogcomp;
+package edu.illinois.cs.cogcomp.lm;

 /**
 * Created by mayhew2 on 11/18/15.

--- a/src/main/java/edu/illinois/cs/cogcomp/lm/Tester.java
+++ b/src/main/java/edu/illinois/cs/cogcomp/lm/Tester.java
+package edu.illinois.cs.cogcomp.lm;
+
+import edu.illinois.cs.cogcomp.core.io.LineIO;
+import edu.illinois.cs.cogcomp.lm.NeuralLM;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Ad-hoc command-line driver for {@link NeuralLM}.
+ *
+ * Loads a model with NeuralLM.from_file, reads a word list with LineIO.read, and for every
+ * line sums m.ngram_prob over each adjacent pair of space-separated tokens (after prepending
+ * "<s> " to the line). The sum is printed, followed by one ngram_prob value for each character
+ * of a hard-coded word paired with "</s>".
+ *
+ * Every query list holds exactly two items, and ngram_prob throws when the list size differs
+ * from the model's ngram_size, so the loaded model is expected to have ngram_size == 2.
+ *
+ * Both file paths are hard-coded absolute paths; args is not read.
+ *
+ * Created by mayhew2 on 11/19/15.
+ */
+public class Tester {
+
+    public static void main(String[] args) throws Exception {
+
+        NeuralLM m = NeuralLM.from_file("/home/mayhew2/IdeaProjects/illinois-transliteration/lm/nplm-ru.txt"); // hard-coded model file path
+
+//        List<String> ngrams = new ArrayList<>();
+
+        //ngrams.add("т");
+        //ngrams.add("и");
+        //ngrams.add("н");
+
+//        ngrams.add("a");
+//        ngrams.add("b");
+//        ngrams.add("н");
+
+        List<String> lines = LineIO.read("/home/mayhew2/IdeaProjects/illinois-transliteration/lm/ruwords.txt"); // hard-coded input file, one entry per line
+
+        double total = 0; // running sum of the ngram_prob values returned below
+
+        List<String> ngrams = new ArrayList<>(); // single list reused (cleared) for every query
+
+        for(String line : lines){
+            line = "<s> " + line; // prepend the "<s>" token so the first real token has a left neighbour
+            String[] sline = line.split(" "); // tokens are separated by single spaces
+
+            for(int i = 0; i < sline.length-1; i++){ // stop one short: each step uses tokens i and i+1
+                ngrams.clear();
+
+                ngrams.add(sline[i]);
+                ngrams.add(sline[i+1]);
+
+
+                total += m.ngram_prob(ngrams); // ngram_prob is documented as returning a log probability
+            }
+
+
+        }
+        System.out.println(total);
+
+        System.out.println(); // blank separator line in the output
+        String russian = "хрусталь"; // hard-coded probe word
+
+        for(char c : russian.toCharArray()){
+            ngrams.clear();
+            ngrams.add(c + ""); // the character as a one-character String
+            ngrams.add("</s>"); // paired with the "</s>" token
+
+            System.out.println(m.ngram_prob(ngrams)); // one value printed per character of the probe word
+        }
+
+
+
+
+    }
+
+}