Skip to content
Snippets Groups Projects
Commit 6db1d617 authored by tgupta6's avatar tgupta6
Browse files

Merge branch 'master' of gitlab-beta.engr.illinois.edu:Vision/GenVQA

parents fb7bf2b0 17380947
No related branches found
No related tags found
No related merge requests found
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.List;
import java.util.ArrayList;
import java.io.StringReader;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
/**
 * Reads one question per line from a file, parses each with the Stanford
 * lexicalized parser, and prints four "|"-separated fields per question:
 * the first two words, the remaining non-noun words, the subject noun(s)
 * (the nsubj/nsubjpass dependent plus its compound parts), and all other
 * noun words found in the parse tree.
 */
class ParseQuestions {

    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: java ParseQuestions <question-file>");
            System.exit(1);
        }
        String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
        LexicalizedParser lp = LexicalizedParser.loadModel(parserModel);
        try {
            parseFile(lp, args[0]);
        } catch (IOException ex) {
            // The original swallowed this silently; surface the failure instead.
            System.err.println("Failed to process '" + args[0] + "': " + ex.getMessage());
            System.exit(1);
        }
    }

    /**
     * Returns true iff the string contains no lowercase characters, i.e.
     * uppercasing it is a no-op. Note this is also true for strings made
     * only of digits or punctuation.
     */
    public static boolean isAllUpperCase(String s) {
        return s.equals(s.toUpperCase());
    }

    /**
     * Parses every line of {@code filename} as a question and prints the
     * four-field summary for each one. Blank lines are skipped.
     *
     * @param lp       a loaded lexicalized parser
     * @param filename path of a text file with one question per line
     * @throws IOException if the file cannot be opened or read
     */
    public static void parseFile(LexicalizedParser lp, String filename) throws IOException {
        TreebankLanguagePack tlp = lp.treebankLanguagePack(); // a PennTreebankLanguagePack for English
        GrammaticalStructureFactory gsf = null;
        if (tlp.supportsGrammaticalStructures()) {
            gsf = tlp.grammaticalStructureFactory();
        }
        TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

        // try-with-resources: the original never closed the reader (leak on
        // every exit path, including exceptions).
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String question;
            while ((question = br.readLine()) != null) {
                String[] splits = question.trim().split("\\s++");
                // Skip blank lines: the original indexed splits[1]
                // unconditionally and threw ArrayIndexOutOfBoundsException.
                if (splits.length == 0 || splits[0].isEmpty()) {
                    continue;
                }
                // Heuristic: an all-caps second word means the whole line is
                // upper-cased; normalize it before parsing. Guarded so a
                // one-word question no longer crashes.
                if (splits.length > 1 && isAllUpperCase(splits[1])) {
                    question = question.toLowerCase();
                }

                Tokenizer<CoreLabel> tok =
                    tokenizerFactory.getTokenizer(new StringReader(question));
                List<CoreLabel> rawWords = tok.tokenize();
                Tree parse = lp.apply(rawWords);

                Set<String> subjn = new HashSet<String>();       // subject noun(s)
                Set<String> othern = new HashSet<String>();      // all other nouns
                List<String> firstTwo = new ArrayList<String>(); // first two words
                List<String> allelse = new ArrayList<String>();  // everything non-noun

                // Collect every noun leaf (NN/NNS/NNP/NNPS) from the parse tree.
                for (Tree node : parse) {
                    String label = node.label().value();
                    if (label.equals("NN") || label.equals("NNS")
                            || label.equals("NNP") || label.equals("NNPS")) {
                        othern.add(node.firstChild().label().value());
                    }
                }

                // First two tokens go to firstTwo; later tokens that are not
                // nouns go to allelse.
                int count = 0;
                for (CoreLabel cl : rawWords) {
                    if (count < 2) {
                        ++count;
                        firstTwo.add(cl.word());
                    } else if (!othern.contains(cl.word())) {
                        allelse.add(cl.word());
                    }
                }

                if (gsf != null) {
                    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
                    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();

                    List<TypedDependency> compounds = new ArrayList<TypedDependency>();
                    TypedDependency nsubjtd = null;
                    for (TypedDependency td : tdl) {
                        String reln = td.reln().getShortName();
                        if (reln.equals("compound")) {
                            compounds.add(td);
                        }
                        if (reln.equals("nsubj") || reln.equals("nsubjpass")) {
                            nsubjtd = td;
                        }
                    }

                    if (nsubjtd != null) {
                        String nsubjdep = nsubjtd.dep().word();
                        if (othern.contains(nsubjdep)) {
                            othern.remove(nsubjdep);
                            subjn.add(nsubjdep);
                        }
                        // Pull compound-noun parts of the subject into subjn,
                        // whichever side of the compound relation they are on.
                        for (TypedDependency td : compounds) {
                            String gov = td.gov().word();
                            String dep = td.dep().word();
                            if (subjn.contains(gov) && othern.contains(dep)) {
                                othern.remove(dep);
                                subjn.add(dep);
                            } else if (subjn.contains(dep) && othern.contains(gov)) {
                                // Bug fix: the original re-tested
                                // subjn.contains(gov) here, so this branch
                                // could never actually move the governor.
                                othern.remove(gov);
                                subjn.add(gov);
                            }
                        }
                    }
                }

                printFields(firstTwo, allelse, subjn, othern);
            }
        }
    }

    /** Prints the four word groups as "| a b | c | d | e  |" on one line. */
    private static void printFields(Collection<String> firstTwo,
                                    Collection<String> allelse,
                                    Collection<String> subjn,
                                    Collection<String> othern) {
        System.out.print("| ");
        for (String s : firstTwo) {
            System.out.print(s + " ");
        }
        System.out.print("| ");
        for (String s : allelse) {
            System.out.print(s + " ");
        }
        System.out.print("| ");
        for (String s : subjn) {
            System.out.print(s + " ");
        }
        System.out.print("| ");
        for (String s : othern) {
            System.out.print(s + " ");
        }
        System.out.println(" |");
    }
}
Dependencies:
Install the latest Stanford parser:
$ wget http://nlp.stanford.edu/software/stanford-parser-full-2015-12-09.zip
$ unzip stanford-parser-full-2015-12-09.zip
Compile:
$sh compile.sh
Run example:
$ sh parse.sh example.txt
Output:
|first 2 words| other words | nsubj | other nounphrase words|
javac -cp ".:stanford-parser-full-2015-12-09/*" ParseQuestions.java
What color is my nail polish?
Is there a red car on the road?
What color is the light on the table in the bedroom?
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment