Skip to content
Snippets Groups Projects
Commit 17380947 authored by Kevin Shih's avatar Kevin Shih
Browse files

added parser code from cvpr vqa paper

parent 067b012c
No related branches found
No related tags found
No related merge requests found
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.*;
/**
 * Parses each line of a question file with the Stanford lexicalized parser and
 * prints, per question, four pipe-delimited word groups:
 * {@code | first 2 words | other words | nsubj | other noun-phrase words |}.
 * Nouns come from NN* POS tags in the parse tree; the subject noun is taken
 * from the nsubj/nsubjpass typed dependency, augmented with its compound-noun
 * partners.
 */
class ParseQuestions {

    public static void main(String[] args) {
        // Stock English PCFG model shipped inside the Stanford parser jar.
        String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
        LexicalizedParser lp = LexicalizedParser.loadModel(parserModel);
        String textFile = args[0];
        try {
            parseFile(lp, textFile);
        } catch (IOException ex) {
            // Report the failure instead of silently swallowing it
            // (the original had an empty catch block).
            System.err.println("Error reading " + textFile + ": " + ex.getMessage());
        }
    }

    /**
     * Returns true iff uppercasing leaves {@code s} unchanged, i.e. it contains
     * no lowercase letters. Note this is also true for strings made only of
     * digits or punctuation — callers use it as an "all caps" heuristic.
     */
    public static boolean isAllUpperCase(String s) {
        return s.toUpperCase().equals(s);
    }

    /**
     * Parses every line of {@code filename} as a question and prints the
     * pipe-delimited word groups described on the class.
     *
     * @param lp       loaded lexicalized parser
     * @param filename path of a text file with one question per line
     * @throws IOException if the file cannot be opened or read
     */
    public static void parseFile(LexicalizedParser lp, String filename) throws IOException {
        TreebankLanguagePack tlp = lp.treebankLanguagePack(); // a PennTreebankLanguagePack for English
        GrammaticalStructureFactory gsf = null;
        if (tlp.supportsGrammaticalStructures()) {
            gsf = tlp.grammaticalStructureFactory();
        }
        TokenizerFactory<CoreLabel> tokenizerFactory =
                PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        // try-with-resources: the original never closed the reader (resource leak).
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String question;
            while ((question = br.readLine()) != null) {
                String[] splits = question.trim().split("\\s++");
                // Lowercase questions whose second word is all caps. Guard on
                // length: blank or one-word lines used to throw
                // ArrayIndexOutOfBoundsException on splits[1].
                if (splits.length > 1 && isAllUpperCase(splits[1])) {
                    question = question.toLowerCase();
                }
                Tokenizer<CoreLabel> tok =
                        tokenizerFactory.getTokenizer(new StringReader(question));
                List<CoreLabel> rawWords = tok.tokenize();
                Tree parse = lp.apply(rawWords);

                Set<String> subjn = new HashSet<String>();       // subject noun(s)
                Set<String> othern = new HashSet<String>();      // all other nouns
                List<String> firstTwo = new ArrayList<String>(); // first two words
                List<String> allelse = new ArrayList<String>();  // everything non-noun

                // Collect every noun (any NN* POS tag) from the parse tree.
                for (Tree node : parse) {
                    String currLabel = node.label().value();
                    if (currLabel.equals("NN") || currLabel.equals("NNS")
                            || currLabel.equals("NNP") || currLabel.equals("NNPS")) {
                        othern.add(node.firstChild().label().value());
                    }
                }

                // First two tokens -> firstTwo; remaining non-noun tokens -> allelse.
                int count = 0;
                for (CoreLabel cl : rawWords) {
                    if (count < 2) {
                        ++count;
                        firstTwo.add(cl.word());
                    } else if (!othern.contains(cl.word())) {
                        allelse.add(cl.word());
                    }
                }

                if (gsf != null) {
                    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
                    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
                    List<TypedDependency> compounds = new ArrayList<TypedDependency>();
                    TypedDependency nsubjtd = null;
                    for (TypedDependency td : tdl) {
                        GrammaticalRelation reln = td.reln();
                        if (reln.getShortName().equals("compound")) {
                            compounds.add(td);
                        }
                        if (reln.getShortName().equals("nsubj")
                                || reln.getShortName().equals("nsubjpass")) {
                            nsubjtd = td;
                        }
                    }
                    if (nsubjtd != null) {
                        // Move the subject-dependency noun from othern to subjn.
                        String nsubjdep = nsubjtd.dep().word();
                        if (othern.contains(nsubjdep)) {
                            othern.remove(nsubjdep);
                            subjn.add(nsubjdep);
                        }
                        // Pull compound-noun partners of the subject into subjn.
                        for (TypedDependency td : compounds) {
                            String gov = td.gov().word();
                            String dep = td.dep().word();
                            if (subjn.contains(gov) && othern.contains(dep)) {
                                othern.remove(dep);
                                subjn.add(dep);
                            }
                            // BUGFIX: the original re-tested the identical
                            // condition (dep in othern && gov in subjn) but then
                            // moved gov, which that test already placed in subjn.
                            // The intended symmetric case is gov still in othern
                            // with dep already in subjn.
                            if (othern.contains(gov) && subjn.contains(dep)) {
                                othern.remove(gov);
                                subjn.add(gov);
                            }
                        }
                    }

                    // Output: | first 2 words | other words | nsubj | other noun-phrase words |
                    System.out.print("| ");
                    for (String s : firstTwo) {
                        System.out.print(s + " ");
                    }
                    System.out.print("| ");
                    for (String s : allelse) {
                        System.out.print(s + " ");
                    }
                    System.out.print("| ");
                    for (String s : subjn) {
                        System.out.print(s + " ");
                    }
                    System.out.print("| ");
                    for (String s : othern) {
                        System.out.print(s + " ");
                    }
                    System.out.println(" |");
                }
            }
        }
    }
}
Dependencies:
install the latest stanford parser:
$wget http://nlp.stanford.edu/software/stanford-parser-full-2015-12-09.zip
$unzip stanford-parser-full-2015-12-09.zip
Compile:
$sh compile.sh
Run example:
$sh parse.sh example.txt
Output:
|first 2 words| other words | nsubj | other nounphrase words|
javac -cp ".:stanford-parser-full-2015-12-09/*" ParseQuestions.java
What color is my nail polish?
Is there a red car on the road?
What color is the light on the table in the bedroom?
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment