Parsing script for VQA test and test-dev

949f26f5 · tgupta6 · 324b3beb · 949f26f5
Commit 949f26f5 authored 8 years ago by tgupta6
--- a/vqa_parser.py
+++ b/vqa_parser.py
@@ -55,6 +55,19 @@ def parse_annotations(input_json, output_json):
        ujson.dump(parsed_anno, file, indent=4, sort_keys=True)
            

+def parse_test_annotations(questions_json, output_json):  # re-key the questions JSON by question_id and save it to output_json
+    print 'Reading json file: {}'.format(questions_json)
+    with open(questions_json, 'r') as file:
+        data = ujson.load(file)
+
+    parsed_anno = dict()  # maps each entry's 'question_id' to the full entry
+    for ques_data in data['questions']:
+        parsed_anno[ques_data['question_id']] = ques_data  # a repeated question_id overwrites the earlier entry
+    # Dump the mapping as indented JSON with sorted keys.
+    print 'Writing constructed dict to file: {}'.format(output_json)
+    with open(output_json, 'w') as file:
+        ujson.dump(parsed_anno, file, indent=4, sort_keys=True)
+
 def write_json_with_parsed_questions(
        input_json,
        questions_txt_filename,
@@ -111,6 +124,19 @@ def add_mcq_options(anno_json, questions_json, out_json):
    with open(out_json, 'w') as file:
        ujson.dump(anno, file, indent=4, sort_keys=True)

+def add_mcq_answer_to_test(anno_json):  # add a placeholder 'multiple_choice_answer' to every entry, rewriting anno_json in place
+    print 'Reading json file: {}'.format(anno_json)
+    with open(anno_json, 'r') as file:
+        anno = ujson.load(file)
+
+    print 'Adding multiple_choice_answer (dummy) ...'
+    for key, val in anno.items():  # key is unused; only the entry dicts are modified
+        val['multiple_choice_answer'] = val['multiple_choices'][0]  # dummy value: the first listed choice; every entry must already have a non-empty 'multiple_choices'
+
+    print 'Writing json file: {}'.format(anno_json)  # same path that was read, so the input file is overwritten
+    with open(anno_json, 'w') as file:
+        ujson.dump(anno, file, indent=4, sort_keys=True)
+

 def add_noun_adjective_labels(
        anno_json, 
@@ -242,6 +268,18 @@ def list_of_val_question_ids(
    with open(val_qids_json,'w') as file:
        ujson.dump(qids, file)

+def list_of_test_question_ids(  # write the top-level keys of the json_anno file to test_qids_json
+        json_anno,
+        test_qids_json):
+
+    with open(json_anno,'r') as file:
+        anno_data = ujson.load(file)
+    # The keys of the loaded JSON object are treated as the question ids.
+    qids = anno_data.keys()  # NOTE(review): a list under Python 2 (this file uses print statements); confirm before porting to Python 3
+
+    with open(test_qids_json,'w') as file:
+        ujson.dump(qids, file)
+

 def counts_of_question_objects_and_attributes(
        json_anno,
@@ -318,34 +356,35 @@ def check_clash(vqa_hash, genome_hash):

 if __name__=='__main__':
    datadir = '/home/ssd/VQA/'
-    mode = 'val'
+    mode = 'test-dev'
+    year = '2015'
    questions_json_filename = os.path.join(
        datadir,
-        'MultipleChoice_mscoco_' + mode + '2014_questions.json')
+        'MultipleChoice_mscoco_' + mode + year + '_questions.json')

    questions_txt_filename = os.path.join(
        datadir,
-        'MultipleChoice_mscoco_' + mode + '2014_questions_dump.txt')
+        'MultipleChoice_mscoco_' + mode + year + '_questions_dump.txt')

    question_ids_txt_filename = os.path.join(
        datadir,
-        'MultipleChoice_mscoco_' + mode + '2014_question_ids_dump.txt')
+        'MultipleChoice_mscoco_' + mode + year + '_question_ids_dump.txt')

    questions_parsed_txt_filename = os.path.join(
        datadir,
-        'MultipleChoice_mscoco_' + mode + '2014_questions_parsed.txt')
+        'MultipleChoice_mscoco_' + mode + year + '_questions_parsed.txt')
    
    annotations_json_filename = os.path.join(
        datadir,
-        'mscoco_' + mode + '2014_annotations.json')
+        'mscoco_' + mode + year + '_annotations.json')

    annotations_parsed_json_filename = os.path.join(
        datadir,
-        'mscoco_' + mode + '2014_annotations_parsed.json')
+        'mscoco_' + mode + year + '_annotations_parsed.json')

    annotations_with_parsed_questions_filename = os.path.join(
        datadir,
-        'mscoco_' + mode + '2014_annotations_with_parsed_questions.json')
+        'mscoco_' + mode + year + '_annotations_with_parsed_questions.json')

    answer_vocab_filename = os.path.join(
        datadir,
@@ -353,43 +392,51 @@ if __name__=='__main__':

    nouns_json_filename = os.path.join(
        datadir,
-        'mscoco_' + mode + '2014_question_nouns.json')
+        'mscoco_' + mode + year + '_question_nouns.json')

    adjectives_json_filename = os.path.join(
        datadir,
-        'mscoco_' + mode + '2014_question_adjectives.json')
+        'mscoco_' + mode + year + '_question_adjectives.json')

    ans_vocab_size = 5000

-    # dump_questions_to_txt(
-    #     questions_json_filename,
-    #     questions_txt_filename,
-    #     question_ids_txt_filename)
+    dump_questions_to_txt(
+        questions_json_filename,
+        questions_txt_filename,
+        question_ids_txt_filename)
    
-    # parse_questions(
-    #     questions_txt_filename,
-    #     questions_parsed_txt_filename)
-
-    # parse_annotations(
-    #     annotations_json_filename, 
-    #     annotations_parsed_json_filename)
-
-    # write_json_with_parsed_questions(
-    #     annotations_parsed_json_filename,
-    #     questions_parsed_txt_filename,
-    #     question_ids_txt_filename,
-    #     annotations_with_parsed_questions_filename)
+    parse_questions(
+        questions_txt_filename,
+        questions_parsed_txt_filename)
+
+    if 'test' in mode:
+        parse_test_annotations(
+            questions_json_filename, 
+            annotations_parsed_json_filename)
+    else:
+        parse_annotations(
+            annotations_json_filename, 
+            annotations_parsed_json_filename)
+
+    write_json_with_parsed_questions(
+        annotations_parsed_json_filename,
+        questions_parsed_txt_filename,
+        question_ids_txt_filename,
+        annotations_with_parsed_questions_filename)
    
-    # add_mcq_options(
-    #     annotations_with_parsed_questions_filename, 
-    #     questions_json_filename, 
-    #     annotations_with_parsed_questions_filename)
-
-    # add_noun_adjective_labels (
-    #     annotations_with_parsed_questions_filename, 
-    #     questions_txt_filename, 
-    #     question_ids_txt_filename,
-    #     annotations_with_parsed_questions_filename)
+    if 'test' in mode:
+        add_mcq_answer_to_test(annotations_with_parsed_questions_filename)
+    else:
+        add_mcq_options(
+            annotations_with_parsed_questions_filename, 
+            questions_json_filename, 
+            annotations_with_parsed_questions_filename)
+
+    add_noun_adjective_labels (
+        annotations_with_parsed_questions_filename, 
+        questions_txt_filename, 
+        question_ids_txt_filename,
+        annotations_with_parsed_questions_filename)

    # if mode=='train':
        # create_ans_vocab(
@@ -419,6 +466,15 @@ if __name__=='__main__':
        # list_of_val_question_ids(
        #     annotations_with_parsed_questions_filename,
        #     val_qids_json)
+
+    if 'test' in mode:
+        test_qids_json = os.path.join(
+            datadir,
+            mode + '_qids.json')
+
+        list_of_test_question_ids(
+            annotations_with_parsed_questions_filename,
+            test_qids_json)
        
    # counts_of_question_objects_and_attributes(
    #     annotations_with_parsed_questions_filename,
@@ -428,6 +484,6 @@ if __name__=='__main__':

    # generate_md5hash(datadir)

-    check_clash(
-        '/home/ssd/VQA/md5_hash_val2015.json',
-        '/home/ssd/VisualGenome/md5_hash.json')
+    # check_clash(
+    #     '/home/ssd/VQA/md5_hash_val2015.json',
+    #     '/home/ssd/VisualGenome/md5_hash.json')