From 07a48c9b9d77022f89e17a4e118e15a25e59c58a Mon Sep 17 00:00:00 2001
From: tgupta6 <tgupta6@illinois.edu>
Date: Mon, 24 Oct 2016 12:00:30 -0500
Subject: [PATCH] Add parsing support for VQA test and test-dev splits

---
 vqa_parser.py | 136 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 96 insertions(+), 40 deletions(-)

diff --git a/vqa_parser.py b/vqa_parser.py
index 0bc478f..7ca03ea 100644
--- a/vqa_parser.py
+++ b/vqa_parser.py
@@ -55,6 +55,19 @@ def parse_annotations(input_json, output_json):
         ujson.dump(parsed_anno, file, indent=4, sort_keys=True)
             
 
+def parse_test_annotations(questions_json, output_json):
+    print 'Reading json file: {}'.format(questions_json)
+    with open(questions_json, 'r') as file:
+        data = ujson.load(file)
+
+    parsed_anno = dict()
+    for ques_data in data['questions']:
+        parsed_anno[ques_data['question_id']] = ques_data
+    
+    print 'Writing constructed dict to file: {}'.format(output_json)
+    with open(output_json, 'w') as file:
+        ujson.dump(parsed_anno, file, indent=4, sort_keys=True)
+
 def write_json_with_parsed_questions(
         input_json,
         questions_txt_filename,
@@ -111,6 +124,19 @@ def add_mcq_options(anno_json, questions_json, out_json):
     with open(out_json, 'w') as file:
         ujson.dump(anno, file, indent=4, sort_keys=True)
 
+def add_mcq_answer_to_test(anno_json):
+    print 'Reading json file: {}'.format(anno_json)
+    with open(anno_json, 'r') as file:
+        anno = ujson.load(file)
+
+    print 'Adding multiple_choice_answer (dummy) ...'
+    for key, val in anno.items():
+        val['multiple_choice_answer'] = val['multiple_choices'][0]
+
+    print 'Writing json file: {}'.format(anno_json)
+    with open(anno_json, 'w') as file:
+        ujson.dump(anno, file, indent=4, sort_keys=True)
+
 
 def add_noun_adjective_labels(
         anno_json, 
@@ -242,6 +268,18 @@ def list_of_val_question_ids(
     with open(val_qids_json,'w') as file:
         ujson.dump(qids, file)
 
+def list_of_test_question_ids(
+        json_anno,
+        test_qids_json):
+
+    with open(json_anno,'r') as file:
+        anno_data = ujson.load(file)
+    
+    qids = anno_data.keys()
+
+    with open(test_qids_json,'w') as file:
+        ujson.dump(qids, file)
+
 
 def counts_of_question_objects_and_attributes(
         json_anno,
@@ -318,34 +356,35 @@ def check_clash(vqa_hash, genome_hash):
 
 if __name__=='__main__':
     datadir = '/home/ssd/VQA/'
-    mode = 'val'
+    mode = 'test-dev'
+    year = '2015'
     questions_json_filename = os.path.join(
         datadir,
-        'MultipleChoice_mscoco_' + mode + '2014_questions.json')
+        'MultipleChoice_mscoco_' + mode + year + '_questions.json')
 
     questions_txt_filename = os.path.join(
         datadir,
-        'MultipleChoice_mscoco_' + mode + '2014_questions_dump.txt')
+        'MultipleChoice_mscoco_' + mode + year + '_questions_dump.txt')
 
     question_ids_txt_filename = os.path.join(
         datadir,
-        'MultipleChoice_mscoco_' + mode + '2014_question_ids_dump.txt')
+        'MultipleChoice_mscoco_' + mode + year + '_question_ids_dump.txt')
 
     questions_parsed_txt_filename = os.path.join(
         datadir,
-        'MultipleChoice_mscoco_' + mode + '2014_questions_parsed.txt')
+        'MultipleChoice_mscoco_' + mode + year + '_questions_parsed.txt')
     
     annotations_json_filename = os.path.join(
         datadir,
-        'mscoco_' + mode + '2014_annotations.json')
+        'mscoco_' + mode + year + '_annotations.json')
 
     annotations_parsed_json_filename = os.path.join(
         datadir,
-        'mscoco_' + mode + '2014_annotations_parsed.json')
+        'mscoco_' + mode + year + '_annotations_parsed.json')
 
     annotations_with_parsed_questions_filename = os.path.join(
         datadir,
-        'mscoco_' + mode + '2014_annotations_with_parsed_questions.json')
+        'mscoco_' + mode + year + '_annotations_with_parsed_questions.json')
 
     answer_vocab_filename = os.path.join(
         datadir,
@@ -353,43 +392,51 @@ if __name__=='__main__':
 
     nouns_json_filename = os.path.join(
         datadir,
-        'mscoco_' + mode + '2014_question_nouns.json')
+        'mscoco_' + mode + year + '_question_nouns.json')
 
     adjectives_json_filename = os.path.join(
         datadir,
-        'mscoco_' + mode + '2014_question_adjectives.json')
+        'mscoco_' + mode + year + '_question_adjectives.json')
 
     ans_vocab_size = 5000
 
-    # dump_questions_to_txt(
-    #     questions_json_filename,
-    #     questions_txt_filename,
-    #     question_ids_txt_filename)
+    dump_questions_to_txt(
+        questions_json_filename,
+        questions_txt_filename,
+        question_ids_txt_filename)
     
-    # parse_questions(
-    #     questions_txt_filename,
-    #     questions_parsed_txt_filename)
-
-    # parse_annotations(
-    #     annotations_json_filename, 
-    #     annotations_parsed_json_filename)
-
-    # write_json_with_parsed_questions(
-    #     annotations_parsed_json_filename,
-    #     questions_parsed_txt_filename,
-    #     question_ids_txt_filename,
-    #     annotations_with_parsed_questions_filename)
+    parse_questions(
+        questions_txt_filename,
+        questions_parsed_txt_filename)
+
+    if 'test' in mode:
+        parse_test_annotations(
+            questions_json_filename, 
+            annotations_parsed_json_filename)
+    else:
+        parse_annotations(
+            annotations_json_filename, 
+            annotations_parsed_json_filename)
+
+    write_json_with_parsed_questions(
+        annotations_parsed_json_filename,
+        questions_parsed_txt_filename,
+        question_ids_txt_filename,
+        annotations_with_parsed_questions_filename)
     
-    # add_mcq_options(
-    #     annotations_with_parsed_questions_filename, 
-    #     questions_json_filename, 
-    #     annotations_with_parsed_questions_filename)
-
-    # add_noun_adjective_labels (
-    #     annotations_with_parsed_questions_filename, 
-    #     questions_txt_filename, 
-    #     question_ids_txt_filename,
-    #     annotations_with_parsed_questions_filename)
+    if 'test' in mode:
+        add_mcq_answer_to_test(annotations_with_parsed_questions_filename)
+    else:
+        add_mcq_options(
+            annotations_with_parsed_questions_filename, 
+            questions_json_filename, 
+            annotations_with_parsed_questions_filename)
+
+    add_noun_adjective_labels (
+        annotations_with_parsed_questions_filename, 
+        questions_txt_filename, 
+        question_ids_txt_filename,
+        annotations_with_parsed_questions_filename)
 
     # if mode=='train':
         # create_ans_vocab(
@@ -419,6 +466,15 @@ if __name__=='__main__':
         # list_of_val_question_ids(
         #     annotations_with_parsed_questions_filename,
         #     val_qids_json)
+
+    if 'test' in mode:
+        test_qids_json = os.path.join(
+            datadir,
+            mode + '_qids.json')
+
+        list_of_test_question_ids(
+            annotations_with_parsed_questions_filename,
+            test_qids_json)
         
     # counts_of_question_objects_and_attributes(
     #     annotations_with_parsed_questions_filename,
@@ -428,6 +484,6 @@ if __name__=='__main__':
 
     # generate_md5hash(datadir)
 
-    check_clash(
-        '/home/ssd/VQA/md5_hash_val2015.json',
-        '/home/ssd/VisualGenome/md5_hash.json')
+    # check_clash(
+    #     '/home/ssd/VQA/md5_hash_val2015.json',
+    #     '/home/ssd/VisualGenome/md5_hash.json')
-- 
GitLab