From b49d0793d4ea01a3a18bd596b9cde16c77bb7a97 Mon Sep 17 00:00:00 2001 From: Stephen Mayhew <mayhew2@illinois.edu> Date: Mon, 4 Jan 2016 12:17:05 -0600 Subject: [PATCH] moved files to a new package, and updated readme --- README.md | 7 +- pom.xml | 36 +++++++++- .../java/edu/illinois/cs/cogcomp/Tester.java | 27 -------- .../cs/cogcomp/{ => lm}/NeuralLM.java | 37 ++++++---- .../illinois/cs/cogcomp/{ => lm}/NplmJni.java | 2 +- .../edu/illinois/cs/cogcomp/lm/Tester.java | 68 +++++++++++++++++++ 6 files changed, 133 insertions(+), 44 deletions(-) delete mode 100644 src/main/java/edu/illinois/cs/cogcomp/Tester.java rename src/main/java/edu/illinois/cs/cogcomp/{ => lm}/NeuralLM.java (93%) rename src/main/java/edu/illinois/cs/cogcomp/{ => lm}/NplmJni.java (87%) create mode 100644 src/main/java/edu/illinois/cs/cogcomp/lm/Tester.java diff --git a/README.md b/README.md index 81bfa58..86eafd9 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,13 @@ Neural Probabilistic Language Model Toolkit -- Java =================================================== -This is based on the [NPLM toolkit](http://nlg.isi.edu/software/nplm/) from ISI. In fact, it is a simple port of the nplm.py python file from that distribution. The goal is to have a reader for the language model file which can be used in Java. The interesting part is the matrix multiplication to get the probability of text. +This is based on the [NPLM toolkit](http://nlg.isi.edu/software/nplm/) from ISI. In fact, it +is a simple port of the nplm.py python file from that distribution. The goal is to have a +reader for the language model file which can be used in Java. The interesting part is the +matrix multiplication to get the probability of text. Hopefully there will be a JNI section for training a LM from Java also. +To train a model, use all the tools available from the NPLM toolkit (not in Java). These are executables +called `prepareNeuralNetwork`, `trainNeuralNetwork`, `testNeuralNetwork`, `testNeuralLM`, etc. 
diff --git a/pom.xml b/pom.xml index 925ce24..1e0ab8d 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ <groupId>edu.illinois.cs.cogcomp</groupId> <artifactId>nplm-java</artifactId> - <version>1.0-SNAPSHOT</version> + <version>1.0.0</version> <repositories> <repository> @@ -63,4 +63,36 @@ </dependencies> -</project> \ No newline at end of file + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>2.0.2</version> + <configuration> + <source>1.7</source> + <target>1.7</target> + </configuration> + </plugin> + </plugins> + + <extensions> + <extension> + <groupId>org.apache.maven.wagon</groupId> + <artifactId>wagon-ssh</artifactId> + <version>2.4</version> + </extension> + </extensions> + + </build> + + <distributionManagement> + <repository> + <id>CogcompSoftware</id> + <name>CogcompSoftware</name> + <url>scp://bilbo.cs.illinois.edu:/mounts/bilbo/disks/0/www/cogcomp/html/m2repo</url> + </repository> + </distributionManagement> + <packaging>jar</packaging> + +</project> diff --git a/src/main/java/edu/illinois/cs/cogcomp/Tester.java b/src/main/java/edu/illinois/cs/cogcomp/Tester.java deleted file mode 100644 index 7146d2e..0000000 --- a/src/main/java/edu/illinois/cs/cogcomp/Tester.java +++ /dev/null @@ -1,27 +0,0 @@ -package edu.illinois.cs.cogcomp; - -import org.apache.commons.math3.linear.*; - -import java.util.ArrayList; -import java.util.List; - -/** - * Created by mayhew2 on 11/19/15. 
- */ -public class Tester { - - public static void main(String[] args) throws Exception { - - NeuralLM m = NeuralLM.from_file("/shared/experiments/mayhew2/transliteration/NEURAL_LANGUAGE_MODEL/example/inferno.nnlm"); - - List<String> ngrams = new ArrayList<>(); - - ngrams.add("fair"); - ngrams.add("and"); - ngrams.add("xkcd"); - - System.out.println(m.ngram_prob(ngrams)); - - } - -} diff --git a/src/main/java/edu/illinois/cs/cogcomp/NeuralLM.java b/src/main/java/edu/illinois/cs/cogcomp/lm/NeuralLM.java similarity index 93% rename from src/main/java/edu/illinois/cs/cogcomp/NeuralLM.java rename to src/main/java/edu/illinois/cs/cogcomp/lm/NeuralLM.java index dcc001d..16bf989 100644 --- a/src/main/java/edu/illinois/cs/cogcomp/NeuralLM.java +++ b/src/main/java/edu/illinois/cs/cogcomp/lm/NeuralLM.java @@ -1,4 +1,4 @@ -package edu.illinois.cs.cogcomp; +package edu.illinois.cs.cogcomp.lm; import edu.illinois.cs.cogcomp.core.io.LineIO; import org.apache.commons.math3.linear.*; @@ -8,7 +8,6 @@ import org.apache.commons.math3.random.UniformRandomGenerator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.FileNotFoundException; import java.util.*; /** @@ -113,22 +112,34 @@ public class NeuralLM { uniform(this.output_biases, r); } + /** - * inputs is a list of one-hot vectors, size of each should be input_embedding_dimension. - * length of inputs is ngram-1. The first element of inputs is a matrix containing one-hot - * vectors of the first word in each ngram. The second element is a matrix containing one-hot vectors - * of the second word in each ngram. etc. - * - * output is a matrix containing one-hot vectors, where each vector is the last in each ngram. - * output is also a one-hot vector, size of output_embedding_dimension. - * - * @return + * This is primarily intended for character based n-grams. Convenient. + * @param chars needs to have length of ngram_size. + * @return log probability of ngram. 
+ * @throws Exception + */ + public double ngram_prob(char[] chars) throws Exception { + List<String> lst = new ArrayList<>(); + for(char c : chars){ + lst.add(c + ""); + } + return ngram_prob(lst); + } + + /** + * get the probability of a single ngram + * @param ngramlist this is a list of items in the ngram. This needs to be of + * size ngram_size. For example, {'john', 'ate', 'cabbage'} + * for ngram_size=3 + * @return the log probability of this ngram + * @throws Exception need to fix this. */ public double ngram_prob(List<String> ngramlist) throws Exception { // the input here is a single ngram. Length of ngram if(ngramlist.size() != ngram_size){ - throw new Exception("ngmram-list must have the same size as ngram_size"); + throw new Exception("ngram-list must have the same size as ngram_size"); } int unkInd = this.word_to_index.get("<unk>"); @@ -153,6 +164,7 @@ public class NeuralLM { } RealVector r = new ArrayRealVector(this.n_input_vocab); r.setEntry(ind, 1); + // FIXME: this could be changed to a index, not a multiplication RealVector r_embed = this.input_embeddings.transpose().operate(r); concatenated = concatenated.append(r_embed); } @@ -347,7 +359,6 @@ public class NeuralLM { }else if(section.equals("\\output_vocab")){ // FIXME: not sure there is anything I need to do??? 
} else if(section.equals("\\input_embeddings")) { - logger.debug("got here"); m.input_embeddings = read_matrix(sectionlines, m.n_input_vocab, m.input_embedding_dimension); }else if(section.equals("\\hidden_weights 1")) { m.hidden1_weights = read_matrix(sectionlines, m.n_hidden, (m.ngram_size - 1) * m.input_embedding_dimension); diff --git a/src/main/java/edu/illinois/cs/cogcomp/NplmJni.java b/src/main/java/edu/illinois/cs/cogcomp/lm/NplmJni.java similarity index 87% rename from src/main/java/edu/illinois/cs/cogcomp/NplmJni.java rename to src/main/java/edu/illinois/cs/cogcomp/lm/NplmJni.java index ef4d07c..0e8654b 100644 --- a/src/main/java/edu/illinois/cs/cogcomp/NplmJni.java +++ b/src/main/java/edu/illinois/cs/cogcomp/lm/NplmJni.java @@ -1,4 +1,4 @@ -package edu.illinois.cs.cogcomp; +package edu.illinois.cs.cogcomp.lm; /** * Created by mayhew2 on 11/18/15. diff --git a/src/main/java/edu/illinois/cs/cogcomp/lm/Tester.java b/src/main/java/edu/illinois/cs/cogcomp/lm/Tester.java new file mode 100644 index 0000000..830671d --- /dev/null +++ b/src/main/java/edu/illinois/cs/cogcomp/lm/Tester.java @@ -0,0 +1,68 @@ +package edu.illinois.cs.cogcomp.lm; + +import edu.illinois.cs.cogcomp.core.io.LineIO; +import edu.illinois.cs.cogcomp.lm.NeuralLM; + +import java.util.ArrayList; +import java.util.List; + +/** + * Created by mayhew2 on 11/19/15. 
+ */ +public class Tester { + + public static void main(String[] args) throws Exception { + + NeuralLM m = NeuralLM.from_file("/home/mayhew2/IdeaProjects/illinois-transliteration/lm/nplm-ru.txt"); + +// List<String> ngrams = new ArrayList<>(); + + //ngrams.add("т"); + //ngrams.add("и"); + //ngrams.add("н"); + +// ngrams.add("a"); +// ngrams.add("b"); +// ngrams.add("н"); + + List<String> lines = LineIO.read("/home/mayhew2/IdeaProjects/illinois-transliteration/lm/ruwords.txt"); + + double total = 0; + + List<String> ngrams = new ArrayList<>(); + + for(String line : lines){ + line = "<s> " + line; + String[] sline = line.split(" "); + + for(int i = 0; i < sline.length-1; i++){ + ngrams.clear(); + + ngrams.add(sline[i]); + ngrams.add(sline[i+1]); + + + total += m.ngram_prob(ngrams); + } + + + } + System.out.println(total); + + System.out.println(); + String russian = "хрусталь"; + + for(char c : russian.toCharArray()){ + ngrams.clear(); + ngrams.add(c + ""); + ngrams.add("</s>"); + + System.out.println(m.ngram_prob(ngrams)); + } + + + + + } + +} -- GitLab