From edb072c066e00b4fc519e96a63b332a9b26e7952 Mon Sep 17 00:00:00 2001 From: tgupta6 <tgupta6@illinois.edu> Date: Mon, 25 Apr 2016 13:57:55 -0500 Subject: [PATCH] parse questions and write to json file --- .gitignore | 4 +- question_parser/ParseQuestions.class | Bin 0 -> 5200 bytes shapes_dataset/parse_all_questions.py | 59 ++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 question_parser/ParseQuestions.class create mode 100644 shapes_dataset/parse_all_questions.py diff --git a/.gitignore b/.gitignore index a0a86f1..3f430ed 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,6 @@ *.pyc shapes_dataset/images_old shapes_dataset/images -shapes_dataset/*.json \ No newline at end of file +shapes_dataset/*.json +shapes_dataset/*.txt +question_parser/stanford-parser-full-2015-12-09/* \ No newline at end of file diff --git a/question_parser/ParseQuestions.class b/question_parser/ParseQuestions.class new file mode 100644 index 0000000000000000000000000000000000000000..cd3891bfbacedbd646c364be73a7b52a61d11902 GIT binary patch literal 5200 zcmb_g3wRXO75?u`lDnCmz_NKUAOcb)AtbAasDol53Su{FB#9WIk4bitEX+QbodrS> zOY5uFcWo_Le4rQ~XnmB0m};q7+S*#}`~B3mzS`Q>wnBk;?(A-|VHeukeqS<k=H7Gv zbMCqSIcH{{Km6p=0Om4}f*_(E#5m##5?G_)1F$)g3S>Cqk`lvfy;z6!;?mEtL4hjP zd{n_kT*k3Ufe)8+d`y8Ko5khhV&Nw|xI)~o6p&X5>1vK^6imY9UVIYQdhsb-C%~`g zxIw{GL^*C$Fdb0^C*dY9ZpJO*@@bA+#mHy8_$+So;By{q@nEY5w~Nuwd$A37cyT9g z;<!t}$+%lQ(8xAy_u?Mht6&H2;}}pd5BDp001tZbkOw<GcvuWQqTmZ+%{DwL9v>^m z<9Nb@FDlrDG{<fQ&BCK*?D1ew+@BP>H+e85G`_^~6vxvHWoJcV5oZB|TvvZFLrHTy zY%+`wMq*|t740(ZcB9Lpk&>tpi80jFtq86%)*1oJi1h~A96J*0oi7$vv^hri>J}s6 z>T`UV4u)edoe0CYNOGZNbtDp|-E1TYth%lU$fXSB3ByjBizC9|1$99)oC+izBi0kQ z!-1HU2)NUBz%tiorJ%V!(rs9g4Q9AiEVbts0lt{yD;&>oJWFRz$IbB^$5-ieIQDUT zjl$3IbqX}cHyD)j)_0o;ClZg5<4RjRWp`&Cs&p+}lrod<Y?BCsiWY<z<`#Gwr7&8_ zJWtJ7uN6u5wKgwa(nNkX^=@GB9T`aqn2P7|O%<171w*AfABhK+UO3{Fif`cs72n2- zDqg~OXlp9Ii|?uUK7PQ^RNzR$j(3~MWS}{2n?a+?wA$mV&Ddh2+lkx#4B+^oiXY*} z4E~%o5&vbT5jJhHy`Rvb=VlhAdU{OT43EzKRK?GPKR?F{#F3TBh6WYCz%Nx?f=(5` z5~EeQ)fu*A*3x09_%(h*XQAS^cv;0O_#MLpS2yKEtibt3vaii_a#3){7TUJaAB-d& z6~D(H=<HOyD)#;wUT2tGuy@Bc&19gRt}5QZn;d^s@h9Pu6rw|uL!mbJ+De{>Lal8o z{*1SX6Dt0K7l|PZQwtz1;`?YURs2<K{%=D485M8i@5Ce(|G>)(Cl##Cy3?O9!{?d_ zGZr>u-Tf;5iFb&3D)tM{Jl*kVBA$wc8A@ZxRM#p>3iqBcl1UW@@Gp*oDh`R`KP(=G zRR+drOB<(gCaFvo2UhEbF)u|Ir6QI%PgcT8Nq?Hk%G@|i+o&=wR*V}<?lIiFuxpjs zEe@hc>NA{Lu+MC69wSQ`8VXj89#)qTTOFkGk}`VDRw^nCbB?0D#5ST)!x1Hb$d&4L zQnool?fQa6xm<8lfNF+4qI<%>*QFSZl2dSyYToF8%hXH;xlCQ^n6^PFcJ!SV^POQ< zfg_o?<Y_N6E!uLNigCFWH^MEV(lD$nQc{bYSN(e`stjHyo>#xC>go&YEcFs|P0FyS zZjPz%8T!nxr}|zcHdA=K0fIZqzT1l|n9ndJnMx$$wv)_@%LoI=0cvJsZgTIj_%ZM3 zR*Ru#xkAQIEbui)tk(8LBiv%(J-I5ox>zsE7G`Q;W;nkn7i(dK;mMgCu{Y}9nCR!V z=*lcbsG($vu-mvjHkEqO?x_k=JC9P~NhYia(R*eQO4S$Pl*^7_d|j@)tG%W(;ugca zLbLzDb|}ssN3<%+Q|e7Q!nw&MvXACVjYPtt9u~+N7@a#Nodj}ZBt2`!Am`GSba<bU z44LaG4ka=7HkR~RR+AT&=&IB!iqXk!7jD}n#KHzOI!!oh4J$=kReB<J(rNCaf_)Uw z(Mn06vJob|l6CZo<aUpkL+@#n0ktu6T`?Ln1dEU{5Avw7D~3i69OsN_Xc05n+?*w+ zDLk`Hi%L~fvFK#1=Hcc_8^H<^we?Yohm0siMGl(@%DiGIZV;DJH>EsUPGO+M(+e{6 zNF^#(wi%7D70|u$wPX5STYu6qDG%g$icY^eQ|TyaTIprip?orpDA^c!Im;WSQ%ndY zLMLughi7OkmP7@`GuxjDmPnsTSz@4zWyLU!K171_*#nf&KfQ5+;X;z8(4Ai4P&#Wj z*kdFxw9>WA9jrk)F2XXBg{ln+rF5U{mx2xQQ;_$dq-B;oCp5cp4@w76s+T-Qsy<X> zA^}J)(DC$k2zr*G(U;RCnmvx@r_#t2dYXndn$;i;(?AJFJ4Xixy=-@Y-YJ$-dlD~! zS^LuoQn*?^D-BNno}KiN@9qpzr<Yz*BX`T%Xs43^`8iT8Z(*U?gHRZ@Kq0})5E_Eo zxf>P!aeFXcFByWbvtc(T=%qvOcWR}(QK^?{WocA(>U^KPS>tI``)ksu)jgW0WX>K; zBq%Ye>0G1blR7oN8<PidoCLjGD<8xZ23?t@DQO(9dxtQUtWDF^oKP`@=`?bJJ}xJW z*Su-W(0!V35bqP7t`O3R40~~sK0%w1#>_#~5vX7D`|D}{TQQv^5s9s+C8@!cDoC15 z(iTkAD#WVB+`>ULNmxEGe78K$rzwL75EkKPtt}CfYwcv+U+pX1j+!fcOdF^9(>P@i zbHq|iCCJS4sS>urr;X2_9^TcsPikw_yn~oaQE+`eP50+ESjqG%&7ZMbtyO33hJ@tW zou}7m6yekLTCFyXGyG?|yQt7AwVI6m%&_im)0cUe@9J=^Mz0zeenhL<has&hjkD4? zTdyW(37b6KLh*6s1+)uI6{8{)T4lx=GMjZ~laR8`xL%wifF=rCE~G|_CuSjM5Tup| zPjVKXBo}^ken#Fp;R<17bxF#c%BIKhe*eNW7HQ*-4AJb75XpKCsSTl-sB^9e)_MNL zbV*|gEgTqrX#`{Z{{cfC#Q8*#(W80~epLK#7)#T*VBp{eQJCC%!WOwzWhND(I;t%7 zR8MB17PG06G~z^@f&k_~$LTl^XHe-nld6G^E}VrZ&W3{pxD4muN-V?;ScKc?7md4c zF8wUB82hk<zJY^yooeF&`W_#TR_4btR*g0`7wt?(2Rj=Vvld*!R-%(d=;PXg!4l|V z8_>-*!(`W@hh2|ewiA8q2}IaltYXi@Vy_^|UPYX}fi>(c`Z41GQc@Y#Nj|KXCgCdS zOk6EB;~J?2*GeX?lj68u>c@@J&A3^*9k)n3aI17bwn>lT4(UbQDZPffq&IQ5^ftE3 zmAFT4#J%##*dd>R`{d;qkk{aT*}((yMm#9rj)&x3*h#heVR<hem7k@bc&O&G(=hcA zYB^SNeDE+%;P?>7hdDlS2s7Op+&)Z!n2@Q(8LbhO%shv|(Z$it5qAGbrau$(VY$>D z?%0p21CS0;wWaZcXyWKOfXOs8Ob1u<-(nL;uWN#=^kog5>WUF<KV*V+O>`WDhh`6p uMnY<I?RW<b`^Bt}#t&f5F@#~_6s;BwsBqJQf~$y#CDe38e_ZX7(s~!XNs=T0 literal 0 HcmV?d00001 diff --git a/shapes_dataset/parse_all_questions.py b/shapes_dataset/parse_all_questions.py new file mode 100644 index 0000000..5346715 --- /dev/null +++ b/shapes_dataset/parse_all_questions.py @@ -0,0 +1,59 @@ +import collections +import os +import json +import sys +import pdb + + +if __name__=='__main__': + anno_json_file = sys.argv[1] + parsed_q_json_file = sys.argv[2] + + with open(anno_json_file,'r') as file: + anno_data = json.load(file) + + # write questions to a text file + q_txt_file = open('questions.txt', 'w') + q_id_file = open('question_ids.txt', 'w') + for item in anno_data: + # Get rid of last question mark while writing to file + q_txt_file.write(item['question'][:] + '\n') + q_id_file.write(str(item['question_id']) + '\n') + q_txt_file.close() + q_id_file.close() + os.system('wc -l questions.txt') + + os.chdir('../question_parser') + os.system( + "java -mx1000m -cp '.:./stanford-parser-full-2015-12-09/*' \ + ParseQuestions ./../shapes_dataset/questions.txt > \ + ./../shapes_dataset/parsed_questions.txt") + os.chdir('../shapes_dataset') + + parsed_q_file = open('parsed_questions.txt', 'r') + parsed_q_id_file = open('question_ids.txt', 'r') + parsed_questions = parsed_q_file.readlines() + question_ids = parsed_q_id_file.readlines() + parsed_q_file.close() + parsed_q_id_file.close() + + parsed_q_json_data = [] + for i, parsed_q in enumerate(parsed_questions): + splitted_line = parsed_q[2:-2].replace('?','').split('|') + parsed_q_json_data.append({ + 'question_id': int(question_ids[i][:-1]), + 'question_parse': { + 'bin0': splitted_line[0].rstrip().lstrip().split(' '), + 'bin1': splitted_line[1].rstrip().lstrip().split(' '), + 'bin2': splitted_line[2].rstrip().lstrip().split(' '), + 'bin3': splitted_line[3].rstrip().lstrip().split(' '), + } + }) + + + with open(parsed_q_json_file, 'w') as file: + json.dump(parsed_q_json_data, file, indent=4) + + + + -- GitLab