diff --git a/question_parser/ParseQuestions.java b/question_parser/ParseQuestions.java new file mode 100644 index 0000000000000000000000000000000000000000..a9c617d7d346bb3c1f6c7ee9c2269861107c2b44 --- /dev/null +++ b/question_parser/ParseQuestions.java @@ -0,0 +1,179 @@ +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; +import java.util.List; +import java.util.ArrayList; +import java.io.StringReader; +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException;; + +import edu.stanford.nlp.process.Tokenizer; +import edu.stanford.nlp.process.TokenizerFactory; +import edu.stanford.nlp.process.CoreLabelTokenFactory; +import edu.stanford.nlp.process.DocumentPreprocessor; +import edu.stanford.nlp.process.PTBTokenizer; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.HasWord; +import edu.stanford.nlp.ling.Sentence; +import edu.stanford.nlp.trees.*; +import edu.stanford.nlp.parser.lexparser.LexicalizedParser; + +class ParseQuestions { + public static void main(String[] args) { + String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"; + LexicalizedParser lp = LexicalizedParser.loadModel(parserModel); + String textFile = args[0]; + try { + parseFile(lp, textFile); + } catch (IOException ex) { + + } + } + + public static boolean isAllUpperCase(String s) { + return s.toUpperCase().equals(s); + } + + public static void parseFile(LexicalizedParser lp, String filename) throws IOException { + TreebankLanguagePack tlp = lp.treebankLanguagePack(); // a PennTreebankLanguagePack for English + GrammaticalStructureFactory gsf = null; + if (tlp.supportsGrammaticalStructures()) { + gsf = tlp.grammaticalStructureFactory(); + } + TokenizerFactory<CoreLabel> tokenizerFactory = + PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); + FileReader fr; + BufferedReader br; + + fr = new FileReader(filename); + br = new BufferedReader(fr); + + + String question; + //for (List<HasWord> sentence : new DocumentPreprocessor(filename)) { + while((question = br.readLine()) != null) { + String[] splits = question.trim().split("\\s++"); + if (isAllUpperCase(splits[1])) { // second word is all caps + question = question.toLowerCase(); + } + Tokenizer<CoreLabel> tok = + tokenizerFactory.getTokenizer(new StringReader(question)); + List<CoreLabel> rawWords = tok.tokenize(); + Tree parse = lp.apply(rawWords); + + Set<String> subjn = new HashSet<String>(); // subject noun + Set<String> othern = new HashSet<String>(); // all other nouns + List<String> firstTwo = new ArrayList<String>(); // first three words + List<String> allelse = new ArrayList<String>(); // everything non-noun + + for (Tree node : parse) { + String curr_label = node.label().value(); + // if(curr_label.equals("NP")) { + // System.out.println(node); + // } + // if it's any type of noune + if( curr_label.equals("NN") || curr_label.equals("NNS") + || curr_label.equals("NNP") || curr_label.equals("NNPS")) { + othern.add(node.firstChild().label().value()); + } + } + + int count = 0; + for (CoreLabel cl : rawWords ) { + if (count < 2) { + ++count; + firstTwo.add(cl.word()); + } + else if(!othern.contains(cl.word())) { + allelse.add(cl.word()); + } + } + //System.out.println("All nouns"); + //System.out.println(othern); + //parse.pennPrint(); + //System.out.println(); + + if (gsf != null) { + GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); + List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); + // System.out.println(tdl); + List<TypedDependency> compounds = new ArrayList<TypedDependency>(); + TypedDependency nsubjtd = null; + for (TypedDependency td: tdl) { + GrammaticalRelation reln = td.reln(); + if(reln.getShortName().equals("compound")) { + compounds.add(td); + } + if (reln.getShortName().equals("nsubj") || reln.getShortName().equals("nsubjpass")) { + nsubjtd = td; + } + } + + if (nsubjtd != null) { + //String nsubjgov = nsubjtd.gov().originalText(); + String nsubjdep = nsubjtd.dep().word(); + // boolean nsubj_found = false; + if (othern.contains(nsubjdep)) { + // System.out.println(nsubjdep); + othern.remove(nsubjdep); + subjn.add(nsubjdep); + // nsubj_found = true; + } + + /*if (!nsubj_found) { // check if the gov is a noun + String nsubjgov = nsubjtd.gov().originalText(); + if (othern.contains(nsubjgov)) { + // System.out.println(nsubjdep); + othern.remove(nsubjgov); + subjn.add(nsubjgov); + } + + }*/ + // now check for compound nouns that belong with the subjn + for (TypedDependency td : compounds) { + String gov = td.gov().word(); + String dep = td.dep().word(); + + if (subjn.contains(gov) && othern.contains(dep)) { + // System.out.println(nsubjgov); + othern.remove(dep); + subjn.add(dep); + } + if (othern.contains(dep) && subjn.contains(gov)) { + // System.out.println(nsubjdep); + othern.remove(gov); + subjn.add(gov); + } + } + } + // System.out.println("Subject noun words"); + /* System.out.println(subjn); + System.out.println(othern); + System.out.println(firstThree); + System.out.println(allelse); + */ + // print output + System.out.print("| "); + for (String s : firstTwo ) { + System.out.print(s + " "); + } + System.out.print("| "); + for (String s : allelse ) { + System.out.print(s + " "); + } + System.out.print("| "); + for (String s : subjn ) { + System.out.print(s + " "); + } + System.out.print("| "); + for (String s : othern ) { + System.out.print(s + " "); + } + System.out.println(" |");; + + } + } + + } +} diff --git a/question_parser/README.txt b/question_parser/README.txt new file mode 100644 index 0000000000000000000000000000000000000000..5580825ba8593963f1732693eb49c1495cb2d98d --- /dev/null +++ b/question_parser/README.txt @@ -0,0 +1,16 @@ +Dependencies: +install the latest stanford parser: +$wget http://nlp.stanford.edu/software/stanford-parser-full-2015-12-09.zip +$unzip stanford-parser-full-2015-12-09.zip + + +Compile: +$sh compile.sh + + +Run example: +sh parse.sh example.txt + + +Output: +|first 2 words| other words | nsubj | other nounphrase words| diff --git a/question_parser/compile.sh b/question_parser/compile.sh new file mode 100644 index 0000000000000000000000000000000000000000..f3865f5c0cf2903573f5da72892b579b6cf010f2 --- /dev/null +++ b/question_parser/compile.sh @@ -0,0 +1 @@ +javac -cp ".:stanford-parser-full-2015-12-09/*" ParseQuestions.java diff --git a/question_parser/example.txt b/question_parser/example.txt new file mode 100644 index 0000000000000000000000000000000000000000..145eb23c0a4e5f1eff7d1b494934c7a8a266ad0b --- /dev/null +++ b/question_parser/example.txt @@ -0,0 +1,3 @@ +What color is my nail polish? +Is there a red car on the road? +What color is the light on the table in the bedroom?