Skip to content
Snippets Groups Projects
Commit 17380947 authored by Kevin Shih's avatar Kevin Shih
Browse files

added parser code from cvpr vqa paper

parent 067b012c
No related branches found
No related tags found
No related merge requests found
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.*;
/**
 * Parses each line of a question file with the Stanford lexicalized parser and
 * prints, per question, four pipe-delimited word groups:
 * {@code | first 2 words | other words | nsubj | other noun-phrase words |}.
 * Nouns come from NN* POS tags in the parse tree; the subject noun is taken
 * from the nsubj/nsubjpass typed dependency, augmented with its compound-noun
 * partners.
 */
class ParseQuestions {

    public static void main(String[] args) {
        // Stock English PCFG model shipped inside the Stanford parser jar.
        String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
        LexicalizedParser lp = LexicalizedParser.loadModel(parserModel);
        String textFile = args[0];
        try {
            parseFile(lp, textFile);
        } catch (IOException ex) {
            // Report the failure instead of silently swallowing it
            // (the original had an empty catch block).
            System.err.println("Error reading " + textFile + ": " + ex.getMessage());
        }
    }

    /**
     * Returns true iff uppercasing leaves {@code s} unchanged, i.e. it contains
     * no lowercase letters. Note this is also true for strings made only of
     * digits or punctuation — callers use it as an "all caps" heuristic.
     */
    public static boolean isAllUpperCase(String s) {
        return s.toUpperCase().equals(s);
    }

    /**
     * Parses every line of {@code filename} as a question and prints the
     * pipe-delimited word groups described on the class.
     *
     * @param lp       loaded lexicalized parser
     * @param filename path of a text file with one question per line
     * @throws IOException if the file cannot be opened or read
     */
    public static void parseFile(LexicalizedParser lp, String filename) throws IOException {
        TreebankLanguagePack tlp = lp.treebankLanguagePack(); // a PennTreebankLanguagePack for English
        GrammaticalStructureFactory gsf = null;
        if (tlp.supportsGrammaticalStructures()) {
            gsf = tlp.grammaticalStructureFactory();
        }
        TokenizerFactory<CoreLabel> tokenizerFactory =
                PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        // try-with-resources: the original never closed the reader (resource leak).
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String question;
            while ((question = br.readLine()) != null) {
                String[] splits = question.trim().split("\\s++");
                // Lowercase questions whose second word is all caps. Guard on
                // length: blank or one-word lines used to throw
                // ArrayIndexOutOfBoundsException on splits[1].
                if (splits.length > 1 && isAllUpperCase(splits[1])) {
                    question = question.toLowerCase();
                }
                Tokenizer<CoreLabel> tok =
                        tokenizerFactory.getTokenizer(new StringReader(question));
                List<CoreLabel> rawWords = tok.tokenize();
                Tree parse = lp.apply(rawWords);

                Set<String> subjn = new HashSet<String>();       // subject noun(s)
                Set<String> othern = new HashSet<String>();      // all other nouns
                List<String> firstTwo = new ArrayList<String>(); // first two words
                List<String> allelse = new ArrayList<String>();  // everything non-noun

                // Collect every noun (any NN* POS tag) from the parse tree.
                for (Tree node : parse) {
                    String currLabel = node.label().value();
                    if (currLabel.equals("NN") || currLabel.equals("NNS")
                            || currLabel.equals("NNP") || currLabel.equals("NNPS")) {
                        othern.add(node.firstChild().label().value());
                    }
                }

                // First two tokens -> firstTwo; remaining non-noun tokens -> allelse.
                int count = 0;
                for (CoreLabel cl : rawWords) {
                    if (count < 2) {
                        ++count;
                        firstTwo.add(cl.word());
                    } else if (!othern.contains(cl.word())) {
                        allelse.add(cl.word());
                    }
                }

                if (gsf != null) {
                    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
                    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
                    List<TypedDependency> compounds = new ArrayList<TypedDependency>();
                    TypedDependency nsubjtd = null;
                    for (TypedDependency td : tdl) {
                        GrammaticalRelation reln = td.reln();
                        if (reln.getShortName().equals("compound")) {
                            compounds.add(td);
                        }
                        if (reln.getShortName().equals("nsubj")
                                || reln.getShortName().equals("nsubjpass")) {
                            nsubjtd = td;
                        }
                    }
                    if (nsubjtd != null) {
                        // Move the subject-dependency noun from othern to subjn.
                        String nsubjdep = nsubjtd.dep().word();
                        if (othern.contains(nsubjdep)) {
                            othern.remove(nsubjdep);
                            subjn.add(nsubjdep);
                        }
                        // Pull compound-noun partners of the subject into subjn.
                        for (TypedDependency td : compounds) {
                            String gov = td.gov().word();
                            String dep = td.dep().word();
                            if (subjn.contains(gov) && othern.contains(dep)) {
                                othern.remove(dep);
                                subjn.add(dep);
                            }
                            // BUGFIX: the original re-tested the identical
                            // condition (dep in othern && gov in subjn) but then
                            // moved gov, which that test already placed in subjn.
                            // The intended symmetric case is gov still in othern
                            // with dep already in subjn.
                            if (othern.contains(gov) && subjn.contains(dep)) {
                                othern.remove(gov);
                                subjn.add(gov);
                            }
                        }
                    }

                    // Output: | first 2 words | other words | nsubj | other noun-phrase words |
                    System.out.print("| ");
                    for (String s : firstTwo) {
                        System.out.print(s + " ");
                    }
                    System.out.print("| ");
                    for (String s : allelse) {
                        System.out.print(s + " ");
                    }
                    System.out.print("| ");
                    for (String s : subjn) {
                        System.out.print(s + " ");
                    }
                    System.out.print("| ");
                    for (String s : othern) {
                        System.out.print(s + " ");
                    }
                    System.out.println(" |");
                }
            }
        }
    }
}
Dependencies:
install the latest stanford parser:
$wget http://nlp.stanford.edu/software/stanford-parser-full-2015-12-09.zip
$unzip stanford-parser-full-2015-12-09.zip
Compile:
$sh compile.sh
Run example:
$sh parse.sh example.txt
Output:
|first 2 words| other words | nsubj | other nounphrase words|
javac -cp ".:stanford-parser-full-2015-12-09/*" ParseQuestions.java
What color is my nail polish?
Is there a red car on the road?
What color is the light on the table in the bedroom?
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment