From 5b71bdb802d18c457bf811ace5dba4ea45e858d9 Mon Sep 17 00:00:00 2001 From: tgupta6 <tgupta6@illinois.edu> Date: Mon, 2 May 2016 10:24:00 -0500 Subject: [PATCH] relevance network with explicit features, ans network with margin loss and wordvec sharing --- .../answer_classifier/ans_data_io_helper.py | 43 ++- .../answer_classifier/eval_ans_classifier.py | 90 +++--- .../answer_classifier/train_ans_classifier.py | 97 +++--- classifiers/inherit_example.py | 14 + .../region_ranker/eval_rel_classifier.py | 28 +- .../region_ranker/train_rel_classifier.py | 92 +++--- classifiers/tf_graph_creation_helper.py | 289 +++++++++++++++--- classifiers/train_classifiers.py | 32 +- 8 files changed, 493 insertions(+), 192 deletions(-) create mode 100644 classifiers/inherit_example.py diff --git a/classifiers/answer_classifier/ans_data_io_helper.py b/classifiers/answer_classifier/ans_data_io_helper.py index 50ace34..d5e419b 100644 --- a/classifiers/answer_classifier/ans_data_io_helper.py +++ b/classifiers/answer_classifier/ans_data_io_helper.py @@ -81,6 +81,17 @@ def get_vocab(qa_dict): return vocab, inv_vocab +def join_vocab(vocab, ans_vocab): + joint_vocab = vocab.copy() + count = len(joint_vocab) + for word in ans_vocab.keys(): + if word not in joint_vocab: + joint_vocab[word] = count + count += 1 + + return joint_vocab + + def save_regions(image_dir, out_dir, qa_dict, region_anno_dict, start_id, batch_size, img_width, img_height): @@ -91,7 +102,7 @@ def save_regions(image_dir, out_dir, qa_dict, region_anno_dict, start_id, region_shape = np.array([img_height/3, img_width/3], np.int32) image_done = dict() - for i in xrange(batch_size): + for i in xrange(start_id, start_id + batch_size): image_id = qa_dict[i].image_id image_done[image_id] = False @@ -228,9 +239,9 @@ atr_labels = { } -class feed_dict_creator(): - def __init__(self, region_images, ans_labels, parsed_q, - region_score, keep_prob, plholder_dict, vocab): +class FeedDictCreator(): + def __init__(self, region_images, parsed_q, + keep_prob, plholder_dict, vocab): self.plholder_dict = plholder_dict self.parsed_q = parsed_q self.vocab = vocab @@ -238,8 +249,6 @@ class feed_dict_creator(): self.feed_dict = { plholder_dict['image_regions']: region_images, plholder_dict['keep_prob']: keep_prob, - plholder_dict['gt_answer']: ans_labels, - plholder_dict['region_score']: region_score, } self.add_bin('bin0') self.add_bin('bin1') @@ -281,13 +290,31 @@ class feed_dict_creator(): containment = np.zeros([num_q, num_labels], dtype='float32') for q_num in xrange(num_q): for i, label in labels.items(): - if label in [pq.lower() for pq in self.parsed_q[q_num][bin_name]]: + if label in [pq.lower() for pq in \ + self.parsed_q[q_num][bin_name]]: containment[q_num,i] = 1 plholder = self.plholder_dict[bin_name + '_' + \ label_type + '_' + 'cont'] self.feed_dict[plholder] = containment - + + +class RelFeedDictCreator(FeedDictCreator): + def __init__(self, region_images, parsed_q, + gt_region_scores, keep_prob, plholder_dict, vocab): + FeedDictCreator.__init__(self, region_images, parsed_q, + keep_prob, plholder_dict, vocab) + self.feed_dict[plholder_dict['gt_scores']] = gt_region_scores + + +class AnsFeedDictCreator(FeedDictCreator): + def __init__(self, region_images, ans_labels, parsed_q, + region_scores, keep_prob, plholder_dict, vocab): + FeedDictCreator.__init__(self, region_images, parsed_q, + keep_prob, plholder_dict, vocab) + self.feed_dict[plholder_dict['gt_answer']] = ans_labels + self.feed_dict[plholder_dict['region_score']] = region_scores + class 
html_ans_table_writer(): def __init__(self, filename): diff --git a/classifiers/answer_classifier/eval_ans_classifier.py b/classifiers/answer_classifier/eval_ans_classifier.py index f0c6605..248336b 100644 --- a/classifiers/answer_classifier/eval_ans_classifier.py +++ b/classifiers/answer_classifier/eval_ans_classifier.py @@ -15,9 +15,9 @@ import region_ranker.perfect_ranker as region_proposer import train_ans_classifier as ans_trainer from PIL import Image, ImageDraw -def get_pred(y, qa_anno_dict, region_anno_dict, ans_vocab, vocab, +def get_pred(y, qa_anno_dict, region_anno_dict, parsed_q_dict, ans_vocab, vocab, image_dir, mean_image, start_index, val_set_size, batch_size, - placeholders, img_height, img_width, batch_creator): + plholder_dict, img_height, img_width, batch_creator): inv_ans_vocab = {v: k for k, v in ans_vocab.items()} pred_list = [] @@ -30,14 +30,14 @@ def get_pred(y, qa_anno_dict, region_anno_dict, ans_vocab, vocab, print('Iter: ' + str(i+1) + '/' + str(max_iter)) - region_images, ans_labels, questions, \ + region_images, ans_labels, parsed_q, \ region_score, partition = batch_creator \ .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, ans_vocab, vocab, image_dir, mean_image, start_index+i*batch_size, - batch_size_tmp, + batch_size_tmp, parsed_q_dict, img_height, img_width, 3) if i==max_iter-1: @@ -48,8 +48,9 @@ def get_pred(y, qa_anno_dict, region_anno_dict, ans_vocab, vocab, residual_region_images = np.zeros(shape=[residual_regions, img_height/3, img_width/3, 3]) - residual_questions = np.zeros(shape=[residual_regions, - len(vocab)]) + # residual_questions = np.zeros(shape=[residual_regions, + # len(vocab)]) + residual_ans_labels = np.zeros(shape=[residual_batch_size, len(ans_vocab)]) residual_region_score = np.zeros(shape=[1, residual_regions]) @@ -57,19 +58,29 @@ def get_pred(y, qa_anno_dict, region_anno_dict, ans_vocab, vocab, region_images = np.concatenate((region_images, residual_region_images), axis=0) - questions = np.concatenate((questions, residual_questions), axis=0) +# questions = np.concatenate((questions, residual_questions), axis=0) + for k in xrange(batch_size_tmp*22, batch_size*22): + parsed_q[k] = { + 'bin0': [''], + 'bin1': [''], + 'bin2': [''], + 'bin3': [''], + } + ans_labels = np.concatenate((ans_labels, residual_ans_labels), axis=0) region_score = np.concatenate((region_score, residual_region_score), axis=1) - feed_dict = { - placeholders[0] : region_images, - placeholders[1] : questions, - placeholders[2] : 1.0, - placeholders[3] : ans_labels, - placeholders[4] : region_score, - } + + feed_dict = ans_io_helper \ + .AnsFeedDictCreator(region_images, + ans_labels, + parsed_q, + region_score, + 1.0, + plholder_dict, + vocab).feed_dict ans_ids = np.argmax(y.eval(feed_dict), 1) for j in xrange(batch_size_tmp): @@ -78,13 +89,6 @@ def get_pred(y, qa_anno_dict, region_anno_dict, ans_vocab, vocab, 'answer' : inv_ans_vocab[ans_ids[j]] }] - # g = tf.get_default_graph() - # q_feat_op = g.get_operation_by_name('ans/word_embed/q_feat') - # q_feat = q_feat_op.outputs[0] - # region_feat_op = g.get_operation_by_name('ans/conv2/region_feat') - # region_feat = region_feat_op.outputs[0] - # pdb.set_trace() - return pred_list def eval(eval_params): @@ -92,6 +96,7 @@ def eval(eval_params): train_anno_filename = eval_params['train_json'] test_anno_filename = eval_params['test_json'] + parsed_q_filename = eval_params['parsed_q_json'] regions_anno_filename = eval_params['regions_json'] image_regions_dir = eval_params['image_regions_dir'] outdir = 
eval_params['outdir'] @@ -104,38 +109,47 @@ def eval(eval_params): qa_anno_dict_train = ans_io_helper.parse_qa_anno(train_anno_filename) qa_anno_dict = ans_io_helper.parse_qa_anno(test_anno_filename) + parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename) region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict_train) # Create graph g = tf.get_default_graph() - image_regions, questions, keep_prob, y, region_score= \ - graph_creator.placeholder_inputs_ans(len(vocab), len(ans_vocab), - mode='gt') - + plholder_dict = graph_creator.placeholder_inputs_ans(len(vocab), + len(ans_vocab), + mode='gt') + + image_regions = plholder_dict['image_regions'] + questions = plholder_dict['questions'] + keep_prob = plholder_dict['keep_prob'] + y = plholder_dict['gt_answer'] + region_score = plholder_dict['region_score'] + y_pred_obj = graph_creator.obj_comp_graph(image_regions, 1.0) obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') obj_feat = obj_feat_op.outputs[0] y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') atr_feat = atr_feat_op.outputs[0] - pred_rel_score = graph_creator.rel_comp_graph(image_regions, questions, + pred_rel_score = graph_creator.rel_comp_graph(plholder_dict, + obj_feat, atr_feat, y_pred_obj, y_pred_atr, 'q_obj_atr_reg', 1.0, len(vocab), batch_size) - y_pred = graph_creator.ans_comp_graph(image_regions, questions, keep_prob, - obj_feat, atr_feat, vocab, - inv_vocab, len(ans_vocab), - eval_params['mode']) + y_pred = graph_creator.ans_comp_margin_graph(plholder_dict, + obj_feat, atr_feat, + y_pred_obj, y_pred_atr, + vocab, inv_vocab, ans_vocab, + eval_params['mode']) pred_rel_score_vec = tf.reshape(pred_rel_score, [1, batch_size*ans_io_helper.num_proposals]) + y_avg = graph_creator.aggregate_y_pred(y_pred, pred_rel_score_vec, batch_size, ans_io_helper.num_proposals, len(ans_vocab)) - cross_entropy = graph_creator.loss(y, y_avg) accuracy = graph_creator.evaluation(y, y_avg) # Collect variables @@ -160,16 +174,15 @@ def eval(eval_params): mean_image = np.load('/home/tanmay/Code/GenVQA/Exp_Results/' + \ 'Obj_Classifier/mean_image.npy') - placeholders = [image_regions, questions, keep_prob, y, region_score] - # Batch creator test_batch_creator = ans_io_helper.batch_creator(test_start_id, test_start_id + test_set_size - 1) # Get predictions - pred_dict = get_pred(y_avg, qa_anno_dict, region_anno_dict, ans_vocab, + pred_dict = get_pred(y_avg, qa_anno_dict, region_anno_dict, + parsed_q_dict, ans_vocab, vocab, image_regions_dir, mean_image, test_start_id, - test_set_size, batch_size, placeholders, 75, 75, + test_set_size, batch_size, plholder_dict, 75, 75, test_batch_creator) json_filename = os.path.join(outdir, 'predicted_ans_' + \ @@ -271,11 +284,12 @@ if __name__=='__main__': 'train_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/train_anno.json', 'test_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/test_anno.json', 'regions_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/regions_anno.json', + 'parsed_q_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/parsed_questions.json', 'image_dir': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/images', 'image_regions_dir': '/mnt/ramdisk/image_regions', - 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_w_Rel', - 'rel_model': 
'/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Prob/rel_classifier_q_obj_atr_reg-4', - 'model': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_w_Rel/ans_classifier_' + mode + '-' + str(model_num), + 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_Margin', + 'rel_model': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Explt/rel_classifier_q_obj_atr_reg_explt-9', + 'model': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_Margin/ans_classifier_' + mode + '-' + str(model_num), 'mode' : mode, 'batch_size': 20, 'test_start_id': 94645, diff --git a/classifiers/answer_classifier/train_ans_classifier.py b/classifiers/answer_classifier/train_ans_classifier.py index d1dcc3b..f57515c 100644 --- a/classifiers/answer_classifier/train_ans_classifier.py +++ b/classifiers/answer_classifier/train_ans_classifier.py @@ -31,9 +31,12 @@ def get_process_flow_vars(mode, obj_vars, atr_vars, rel_vars, fine_tune): 'ans/fc1/W_obj', 'ans/fc1/W_atr', 'ans/fc1/W_q', + 'ans/fc1/W_explt', 'ans/fc1/b', - 'ans/fc2/W', - 'ans/fc2/b' + 'ans/fc2/W_feat', + 'ans/fc2/b_feat', + 'ans/fc2/W_ans', + 'ans/fc2/b_ans' ] vars_dict = graph_creator.get_list_of_variables(list_of_vars) @@ -43,8 +46,10 @@ def get_process_flow_vars(mode, obj_vars, atr_vars, rel_vars, fine_tune): vars_dict['ans/word_embed/word_vecs'], vars_dict['ans/fc1/W_q'], vars_dict['ans/fc1/b'], - vars_dict['ans/fc2/W'], - vars_dict['ans/fc2/b'], + vars_dict['ans/fc2/W_feat'], + vars_dict['ans/fc2/b_feat'], + vars_dict['ans/fc2/W_ans'], + vars_dict['ans/fc2/b_ans'], ] reg_ans_params = [ @@ -57,6 +62,7 @@ def get_process_flow_vars(mode, obj_vars, atr_vars, rel_vars, fine_tune): obj_ans_params = [ vars_dict['ans/fc1/W_obj'], + vars_dict['ans/fc1/W_explt'] ] atr_ans_params = [ @@ -88,9 +94,9 @@ def get_process_flow_vars(mode, obj_vars, atr_vars, rel_vars, fine_tune): elif mode=='q_obj_atr_reg': vars_to_train += reg_ans_params - if not mode=='q': - vars_to_train = [var for var in vars_to_train if \ - 'ans/word_embed/word_vecs' not in var.name] + # if not mode=='q': + # vars_to_train = [var for var in vars_to_train if \ + # 'ans/word_embed/word_vecs' not in var.name] # Fine tune begining with a previous model if fine_tune==True: @@ -148,9 +154,9 @@ def evaluate(accuracy, qa_anno_dict, region_anno_dict, ans_vocab, vocab, img_height, img_width, 3) feed_dict = ans_io_helper.\ - feed_dict_creator(region_images, ans_labels, parsed_q, - region_score, 1.0, plholder_dict, - vocab).feed_dict + AnsFeedDictCreator(region_images, ans_labels, parsed_q, + region_score, 1.0, plholder_dict, + vocab).feed_dict correct = correct + accuracy.eval(feed_dict) @@ -179,6 +185,7 @@ def train(train_params): region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict) +# vocab = ans_io_helper.join_vocab(vocab, ans_vocab) # Save region crops if train_params['crop_n_save_regions'] == True: @@ -209,39 +216,38 @@ def train(train_params): atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') atr_feat = atr_feat_op.outputs[0] - # pred_rel_score = graph_creator.rel_comp_graph(image_regions, questions, - # obj_feat, atr_feat, - # 'q_obj_atr_reg', 1.0, - # len(vocab), batch_size) - + pred_rel_score = graph_creator.rel_comp_graph(plholder_dict, + obj_feat, atr_feat, + y_pred_obj, y_pred_atr, + 'q_obj_atr_reg_explt', + 1.0, len(vocab), batch_size) # Restore rel, obj and attribute classifier parameters -# rel_vars = 
tf.get_collection(tf.GraphKeys.VARIABLES, scope='rel') + rel_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='rel') obj_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='obj') atr_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='atr') - # rel_saver = tf.train.Saver(rel_vars) + rel_saver = tf.train.Saver(rel_vars) obj_atr_saver = tf.train.Saver(obj_vars+atr_vars) - # rel_saver.restore(sess, rel_model) + rel_saver.restore(sess, rel_model) obj_atr_saver.restore(sess, obj_atr_model) - y_pred = graph_creator.ans_comp_graph(plholder_dict, - obj_feat, atr_feat, vocab, - inv_vocab, len(ans_vocab), - train_params['mode']) -# pred_rel_score_vec = tf.reshape(pred_rel_score, - # [1, batch_size*ans_io_helper.num_proposals]) - # y_avg = graph_creator.aggregate_y_pred(y_pred, - # pred_rel_score_vec, batch_size, - # ans_io_helper.num_proposals, - # len(ans_vocab)) + y_pred = graph_creator.ans_comp_margin_graph(plholder_dict, + obj_feat, atr_feat, + y_pred_obj, y_pred_atr, + vocab, inv_vocab, ans_vocab, + train_params['mode']) + pred_rel_score_vec = tf.reshape(pred_rel_score, + [1, batch_size*ans_io_helper.num_proposals]) + y_avg = graph_creator.aggregate_y_pred(y_pred, - region_score, batch_size, + pred_rel_score_vec, batch_size, ans_io_helper.num_proposals, len(ans_vocab)) - cross_entropy = graph_creator.loss(y, y_avg) +# cross_entropy = graph_creator.loss(y, y_avg) + margin_loss = graph_creator.margin_loss(y, y_avg, 0.2) accuracy = graph_creator.evaluation(y, y_avg) # Collect variables @@ -249,7 +255,7 @@ def train(train_params): pretrained_vars, vars_to_train, vars_to_restore, vars_to_save, \ vars_to_init, vars_dict = \ get_process_flow_vars(train_params['mode'], - obj_vars, atr_vars, [], #rel_vars, + obj_vars, atr_vars, rel_vars, train_params['fine_tune']) # Regularizers @@ -265,9 +271,11 @@ def train(train_params): vars_dict['ans/fc1/W_obj'], vars_dict['ans/fc1/W_atr'], vars_dict['ans/fc1/W_q'], + vars_dict['ans/fc1/W_explt'], ] - ans_fc2_params = [vars_dict['ans/fc2/W']] + ans_fc2_params = [vars_dict['ans/fc2/W_feat'], + vars_dict['ans/fc2/W_ans']] regularizer_ans_word_vecs = graph_creator \ .regularize_params(ans_word_vec_params) @@ -277,7 +285,12 @@ def train(train_params): regularizer_ans_fcs = graph_creator \ .regularize_params(ans_fc1_params + ans_fc2_params) - total_loss = cross_entropy + \ + # total_loss = margin_loss + \ + # 1e-5 * regularizer_ans_word_vecs + \ + # 1e-5 * regularizer_ans_fcs + \ + # 1e-3 * regularizer_ans_filters + + total_loss = margin_loss + \ 1e-5 * regularizer_ans_word_vecs + \ 1e-5 * regularizer_ans_fcs + \ 1e-3 * regularizer_ans_filters @@ -323,7 +336,7 @@ def train(train_params): # Initialize vars_to_init all_vars = tf.get_collection(tf.GraphKeys.VARIABLES) optimizer_vars = [var for var in all_vars if var not in \ - obj_vars + atr_vars + ans_vars] #rel_vars + ans_vars] + obj_vars + atr_vars + rel_vars + ans_vars] print('Optimizer Variables: ') print([var.name for var in optimizer_vars]) @@ -334,7 +347,7 @@ def train(train_params): mean_image = np.load('/home/tanmay/Code/GenVQA/Exp_Results/' + \ 'Obj_Classifier/mean_image.npy') - placeholders = [image_regions, questions, keep_prob, y, region_score] +# placeholders = [image_regions, questions, keep_prob, y, region_score] # Start Training max_epoch = train_params['max_epoch'] @@ -381,13 +394,13 @@ def train(train_params): 75, 75, 3) feed_dict_train = ans_io_helper \ - .feed_dict_creator(train_region_images, - train_ans_labels, - train_parsed_q, - train_region_score, - 0.5, - plholder_dict, - 
vocab).feed_dict + .AnsFeedDictCreator(train_region_images, + train_ans_labels, + train_parsed_q, + train_region_score, + 0.5, + plholder_dict, + vocab).feed_dict _, current_train_batch_acc, y_avg_eval, loss_eval = \ diff --git a/classifiers/inherit_example.py b/classifiers/inherit_example.py new file mode 100644 index 0000000..095345a --- /dev/null +++ b/classifiers/inherit_example.py @@ -0,0 +1,14 @@ +class baseclass(): + def __init__(self, a): + print a + + def baseMethod(self): + print 'Yeah inheritance' + +class derivedclass(baseclass): + def __init__(self, a, b): + baseclass.__init__(self, a) + print b + self.baseMethod() + +a = derivedclass(1,2) diff --git a/classifiers/region_ranker/eval_rel_classifier.py b/classifiers/region_ranker/eval_rel_classifier.py index 49046f5..0cc92cb 100644 --- a/classifiers/region_ranker/eval_rel_classifier.py +++ b/classifiers/region_ranker/eval_rel_classifier.py @@ -17,6 +17,7 @@ def eval(eval_params): sess = tf.InteractiveSession() train_anno_filename = eval_params['train_json'] test_anno_filename = eval_params['test_json'] + parsed_q_filename = eval_params['parsed_q_json'] regions_anno_filename = eval_params['regions_json'] image_regions_dir = eval_params['image_regions_dir'] outdir = eval_params['outdir'] @@ -33,6 +34,7 @@ def eval(eval_params): qa_anno_dict_train = ans_io_helper.parse_qa_anno(train_anno_filename) qa_anno_dict = ans_io_helper.parse_qa_anno(test_anno_filename) + parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename) region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict_train) @@ -40,24 +42,24 @@ def eval(eval_params): # Create graph g = tf.get_default_graph() - image_regions, questions, y, keep_prob = \ + plholder_dict = \ graph_creator.placeholder_inputs_rel(ans_io_helper.num_proposals, len(vocab), mode='gt') - placeholders = [image_regions, questions, y, keep_prob] + image_regions = plholder_dict['image_regions'] + y = plholder_dict['gt_scores'] + keep_prob = plholder_dict['keep_prob'] + y_pred_obj = graph_creator.obj_comp_graph(image_regions, 1.0) obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') obj_feat = obj_feat_op.outputs[0] y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') atr_feat = atr_feat_op.outputs[0] - y_pred = graph_creator.rel_comp_graph(image_regions, questions, + y_pred = graph_creator.rel_comp_graph(plholder_dict, + obj_feat, atr_feat, y_pred_obj, y_pred_atr, mode, keep_prob, len(vocab), batch_size) - # y_pred = graph_creator.rel_comp_graph(image_regions, questions, - # obj_feat, atr_feat, mode, - # keep_prob, len(vocab), batch_size) - # Restore model restorer = tf.train.Saver() if os.path.exists(model): @@ -76,11 +78,11 @@ def eval(eval_params): # Test Recall test_recall = rel_trainer.evaluate(y_pred, qa_anno_dict, - region_anno_dict, ans_vocab, - vocab, image_regions_dir, - mean_image, test_start_id, - test_set_size, batch_size, - placeholders, 75, 75, - test_batch_creator,verbose=True) + region_anno_dict, parsed_q_dict, + ans_vocab, vocab, + image_regions_dir, mean_image, + test_start_id, test_set_size, + batch_size, plholder_dict, + 75, 75, test_batch_creator,verbose=True) print('Test Rec: ' + str(test_recall)) diff --git a/classifiers/region_ranker/train_rel_classifier.py b/classifiers/region_ranker/train_rel_classifier.py index de0f046..a7f5931 100644 --- 
a/classifiers/region_ranker/train_rel_classifier.py +++ b/classifiers/region_ranker/train_rel_classifier.py @@ -39,34 +39,37 @@ def batch_recall(pred_scores, gt_scores, k): return batch_recall -def evaluate(region_score_pred, qa_anno_dict, region_anno_dict, ans_vocab, vocab, - image_dir, mean_image, start_index, val_set_size, batch_size, - placeholders, img_height, img_width, batch_creator, verbose=False): +def evaluate(region_score_pred, qa_anno_dict, region_anno_dict, parsed_q_dict, + ans_vocab, vocab, image_dir, mean_image, start_index, val_set_size, + batch_size, plholder_dict, img_height, img_width, batch_creator, + verbose=False): recall_at_k = 0 max_iter = int(math.floor(val_set_size/batch_size)) for i in xrange(max_iter): if verbose==True: print('Iter: ' + str(i+1) + '/' + str(max_iter)) - region_images, ans_labels, questions, \ - region_score_vec, partition= batch_creator \ + region_images, ans_labels, parsed_q, \ + region_scores_vec, partition= batch_creator \ .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, ans_vocab, vocab, image_dir, mean_image, start_index+i*batch_size, batch_size, + parsed_q_dict, img_height, img_width, 3) - region_score = batch_creator.reshape_score(region_score_vec) + region_scores = batch_creator.reshape_score(region_scores_vec) - feed_dict = { - placeholders[0] : region_images, - placeholders[1] : questions, - placeholders[2] : region_score, - placeholders[3] : 1.0, - } + feed_dict = ans_io_helper \ + .RelFeedDictCreator(region_images, + parsed_q, + region_scores, + 1.0, + plholder_dict, + vocab).feed_dict region_score_pred_eval = region_score_pred.eval(feed_dict) recall_at_k += batch_recall(region_score_pred_eval, - region_score, -1) + region_scores, -1) recall_at_k /= max_iter @@ -77,6 +80,7 @@ def train(train_params): sess = tf.InteractiveSession() train_anno_filename = train_params['train_json'] test_anno_filename = train_params['test_json'] + parsed_q_filename = train_params['parsed_q_json'] regions_anno_filename = train_params['regions_json'] image_dir = train_params['image_dir'] image_regions_dir = train_params['image_regions_dir'] @@ -89,6 +93,7 @@ def train(train_params): os.mkdir(outdir) qa_anno_dict = ans_io_helper.parse_qa_anno(train_anno_filename) + parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename) region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict) @@ -106,24 +111,24 @@ def train(train_params): # Create graph g = tf.get_default_graph() - image_regions, questions, y, keep_prob = \ + plholder_dict = \ graph_creator.placeholder_inputs_rel(ans_io_helper.num_proposals, len(vocab), mode='gt') - placeholders = [image_regions, questions, y, keep_prob] + image_regions = plholder_dict['image_regions'] + y = plholder_dict['gt_scores'] + keep_prob = plholder_dict['keep_prob'] + y_pred_obj = graph_creator.obj_comp_graph(image_regions, 1.0) obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') obj_feat = obj_feat_op.outputs[0] y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') atr_feat = atr_feat_op.outputs[0] - y_pred = graph_creator.rel_comp_graph(image_regions, questions, + y_pred = graph_creator.rel_comp_graph(plholder_dict, + obj_feat, atr_feat, y_pred_obj, y_pred_atr, mode, keep_prob, len(vocab), batch_size) - # y_pred = graph_creator.rel_comp_graph(image_regions, questions, - # obj_feat, atr_feat, 
mode, - # keep_prob, len(vocab), batch_size) - accuracy = graph_creator.evaluation(y, y_pred) cross_entropy = graph_creator.loss(y, y_pred) @@ -139,6 +144,7 @@ def train(train_params): 'rel/fc1/W_q', 'rel/fc1/W_obj', 'rel/fc1/W_atr', + 'rel/fc1/W_explt', 'rel/fc1/b', 'rel/fc2/W', 'rel/fc2/b', @@ -161,6 +167,7 @@ def train(train_params): vars_dict['rel/fc1/W_q'], vars_dict['rel/fc1/W_obj'], vars_dict['rel/fc1/W_atr'], + vars_dict['rel/fc1/W_explt'], vars_dict['rel/fc2/W'], ] @@ -244,12 +251,12 @@ def train(train_params): # Check accuracy of restored model if train_params['fine_tune']==True: - restored_recall = evaluate(y_pred, qa_anno_dict, - region_anno_dict, ans_vocab, + restored_recall = evaluate(y_pred, qa_anno_dict, region_anno_dict, + parsed_q_dict, ans_vocab, vocab, image_regions_dir, mean_image, val_start_id, val_set_size, batch_size, - placeholders, 75, 75, + plholder_dict, 75, 75, val_batch_creator) print('Recall of restored model: ' + str(restored_recall)) @@ -261,23 +268,26 @@ def train(train_params): train_batch_creator.shuffle_ids() for i in range(max_iter): - train_region_images, train_ans_labels, train_questions, \ + train_region_images, train_ans_labels, train_parsed_q, \ train_region_score_vec, train_partition= train_batch_creator \ .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, ans_vocab, vocab, image_regions_dir, mean_image, - 1+i*batch_size, batch_size, + 1+i*batch_size, batch_size, + parsed_q_dict, 75, 75, 3) + train_region_score = train_batch_creator \ .reshape_score(train_region_score_vec) - feed_dict_train = { - image_regions : train_region_images, - questions: train_questions, - keep_prob: 0.5, - y: train_region_score, - } - + feed_dict_train = ans_io_helper \ + .RelFeedDictCreator(train_region_images, + train_parsed_q, + train_region_score, + 0.5, + plholder_dict, + vocab).feed_dict + _, current_train_batch_acc, y_pred_eval, loss_eval = \ sess.run([train_step, accuracy, y_pred, total_loss], feed_dict=feed_dict_train) @@ -289,23 +299,23 @@ def train(train_params): train_region_score, -1) if (i+1)%500==0: - val_recall = evaluate(y_pred, qa_anno_dict, - region_anno_dict, ans_vocab, vocab, + val_recall = evaluate(y_pred, qa_anno_dict, region_anno_dict, + parsed_q_dict, ans_vocab, vocab, image_regions_dir, mean_image, val_start_id, val_set_size_small, - batch_size, placeholders, 75, 75, + batch_size, plholder_dict, 75, 75, val_small_batch_creator) print('Iter: ' + str(i+1) + ' Val Sm Rec: ' + str(val_recall)) train_rec_array_epoch[epoch] = train_rec_array_epoch[epoch] / max_iter val_rec_array_epoch[epoch] = evaluate(y_pred, qa_anno_dict, - region_anno_dict, ans_vocab, - vocab, image_regions_dir, - mean_image, val_start_id, - val_set_size, batch_size, - placeholders, 75, 75, - val_batch_creator) + region_anno_dict, parsed_q_dict, + ans_vocab, vocab, + image_regions_dir, mean_image, + val_start_id, val_set_size, + batch_size, plholder_dict, + 75, 75, val_batch_creator) print('Val Rec: ' + str(val_rec_array_epoch[epoch]) + ' Train Rec: ' + str(train_rec_array_epoch[epoch])) diff --git a/classifiers/tf_graph_creation_helper.py b/classifiers/tf_graph_creation_helper.py index a60f2e2..0a8c93f 100644 --- a/classifiers/tf_graph_creation_helper.py +++ b/classifiers/tf_graph_creation_helper.py @@ -70,17 +70,33 @@ def placeholder_inputs(mode = 'gt'): def placeholder_inputs_rel(num_proposals, total_vocab_size, mode = 'gt'): - image_regions = tf.placeholder(tf.float32, shape=[None,25,25,3]) - keep_prob = tf.placeholder(tf.float32) - questions = tf.placeholder(tf.float32, 
shape=[None,total_vocab_size]) + plholder_dict = { + 'image_regions': tf.placeholder(tf.float32, [None,25,25,3], + 'image_regions'), + 'keep_prob': tf.placeholder(tf.float32, name='keep_prob'), + } + for i in xrange(4): + bin_name = 'bin' + str(i) + plholder_dict[bin_name + '_shape'] = \ + tf.placeholder(tf.int64, [2], bin_name + '_shape') + plholder_dict[bin_name + '_indices'] = \ + tf.placeholder(tf.int64, [None, 2], bin_name + '_indices') + plholder_dict[bin_name + '_values'] = \ + tf.placeholder(tf.int64, [None], bin_name + '_values') + plholder_dict[bin_name + '_obj_cont'] = \ + tf.placeholder(tf.float32, [None, graph_config['num_objects']], + bin_name + '_obj_cont') + plholder_dict[bin_name + '_atr_cont'] = \ + tf.placeholder(tf.float32, [None, graph_config['num_attributes']], + bin_name + '_atr_cont') if mode == 'gt': print 'Creating placeholder for ground truth' - y = tf.placeholder(tf.float32, - shape=[None, ans_io_helper.num_proposals]) - return (image_regions, questions, y, keep_prob) + plholder_dict['gt_scores'] = tf.placeholder(tf.float32,\ + shape=[None, ans_io_helper.num_proposals], name = 'gt_scores') + return plholder_dict if mode == 'no_gt': print 'No placeholder for ground truth' - return (image_regions, questions, keep_prob) + return plholder_dict def placeholder_inputs_ans(total_vocab_size, ans_vocab_size, mode='gt'): @@ -204,16 +220,58 @@ def atr_comp_graph(x, keep_prob, obj_feat): return y_pred -def rel_comp_graph(image_regions, questions, obj_feat, atr_feat, - mode, keep_prob, vocab_size, batch_size): +def q_bin_embed_graph(bin_name, word_vecs, plholder_dict): + indices = plholder_dict[bin_name + '_indices'] + values = plholder_dict[bin_name + '_values'] + shape = plholder_dict[bin_name + '_shape'] + sp_ids = tf.SparseTensor(indices, values, shape) + return tf.nn.embedding_lookup_sparse(word_vecs, sp_ids, None, + name=bin_name + '_embedding') + + +def explicit_feat_graph(bin_name, classifier_prob, + classifier_type, plholder_dict): + cont_plholder_name = bin_name + '_' + classifier_type + '_cont' + feat_name = 'explt_' + bin_name + '_' + classifier_type + dot_product = tf.mul(classifier_prob, plholder_dict[cont_plholder_name]) + return tf.reduce_mean(dot_product, 1, keep_dims=True, name=feat_name) + + +def rel_comp_graph(plholder_dict, obj_feat, atr_feat, + obj_prob, atr_prob, mode, keep_prob, + vocab_size, batch_size): + image_regions = plholder_dict['image_regions'] with tf.name_scope('rel') as rel_graph: with tf.name_scope('word_embed') as q_embed: word_vecs = weight_variable([vocab_size, graph_config['word_vec_dim']], var_name='word_vecs') - q_feat = tf.matmul(questions, word_vecs, name='q_feat') + bin0_embed = q_bin_embed_graph('bin0', word_vecs, plholder_dict) + bin1_embed = q_bin_embed_graph('bin1', word_vecs, plholder_dict) + bin2_embed = q_bin_embed_graph('bin2', word_vecs, plholder_dict) + bin3_embed = q_bin_embed_graph('bin3', word_vecs, plholder_dict) + q_feat = tf.concat(1, [bin0_embed, + bin1_embed, + bin2_embed, + bin3_embed], name='q_feat') + + with tf.name_scope('explicit_feat') as expl_feat: + explt_feat_list = [] + for bin_num in xrange(4): + bin_name = 'bin'+ str(bin_num) + explt_feat_list.append(explicit_feat_graph(bin_name, obj_prob, + 'obj', plholder_dict)) + explt_feat_list.append(explicit_feat_graph(bin_name, atr_prob, + 'atr', plholder_dict)) + + concat_explt_feat = tf.concat(1, explt_feat_list, + name = 'concat_explt_feat') + + concat_explt_feat_dim = concat_explt_feat.get_shape()[1].value + print('Concatenate explicit feature dimension: ' 
+ \ + str(concat_explt_feat_dim)) with tf.name_scope('conv1') as conv1: W_conv1 = weight_variable([5,5,3,4]) @@ -248,47 +306,57 @@ def rel_comp_graph(image_regions, questions, obj_feat, atr_feat, print 'Atr feat dim: {}'.format(atr_feat_dim) W_reg_fc1 = weight_variable([reg_feat_dim, fc1_dim], var_name='W_reg') - W_q_fc1 = weight_variable([graph_config['word_vec_dim'], + W_q_fc1 = weight_variable([graph_config['q_embed_dim'], fc1_dim], var_name='W_q') W_obj_fc1 = weight_variable([obj_feat_dim, fc1_dim], var_name='W_obj') W_atr_fc1 = weight_variable([atr_feat_dim, fc1_dim], var_name='W_atr') + W_explt_fc1 = weight_variable([concat_explt_feat_dim, + fc1_dim], var_name='W_explt') b_fc1 = bias_variable([fc1_dim]) a_reg_fc1 = tf.matmul(reg_feat, W_reg_fc1, name='a_reg_fc1') a_q_fc1 = tf.matmul(q_feat, W_q_fc1, name='a_q_fc1') a_obj_fc1 = tf.matmul(obj_feat, W_obj_fc1, name='a_obj_fc1') a_atr_fc1 = tf.matmul(atr_feat, W_atr_fc1, name='a_atr_fc1') - + a_explt_fc1 = tf.matmul(concat_explt_feat, W_explt_fc1, + name='a_explt_fc1') coeff = { 'reg': 0.0, 'q': 0.0, 'obj': 0.0, 'atr': 0.0, + 'explt': 0.0, } - if mode=='q_reg': + if mode=='q_reg_explt': print mode - coeff['reg'] = 1/2.0 - coeff['q'] = 1/2.0 + coeff['reg'] = 1/3.0 + coeff['q'] = 1/3.0 + coeff['explt'] = 1/3.0 - elif mode=='q_obj_atr': + elif mode=='q_obj_atr_explt': print mode - coeff['q'] = 1/3.0 - coeff['obj'] = 1/3.0 - coeff['atr'] = 1/3.0 + coeff['q'] = 0.1 + coeff['obj'] = 0.1 + coeff['atr'] = 0.1 + coeff['explt'] = 0.7 - elif mode=='q_obj_atr_reg': + elif mode=='q_obj_atr_reg_explt': print mode - coeff['q'] = 1/4.0 - coeff['obj'] = 1/4.0 - coeff['atr'] = 1/4.0 - coeff['reg'] = 1/4.0 + coeff['q'] = 0.05 + coeff['obj'] = 0.05 + coeff['atr'] = 0.05 + coeff['reg'] = 0.05 + coeff['explt'] = 0.8 + + elif mode=='explt': + coeff['explt'] = 1.0 a_fc1 = coeff['reg']*a_reg_fc1 + coeff['q']*a_q_fc1 + \ coeff['obj']*a_obj_fc1 + coeff['atr']*a_atr_fc1 + \ - b_fc1 + coeff['explt']*a_explt_fc1 + b_fc1 h_fc1 = tf.nn.relu(a_fc1, name='h') h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob, name='h_drop') @@ -306,16 +374,7 @@ def rel_comp_graph(image_regions, questions, obj_feat, atr_feat, y_pred = tf.nn.softmax(logits, name='softmax') return y_pred - - -def q_bin_embed_graph(bin_name, word_vecs, plholder_dict): - indices = plholder_dict[bin_name + '_indices'] - values = plholder_dict[bin_name + '_values'] - shape = plholder_dict[bin_name + '_shape'] - sp_ids = tf.SparseTensor(indices, values, shape) - return tf.nn.embedding_lookup_sparse(word_vecs, sp_ids, None, - name=bin_name + '_embedding') - + def ans_comp_graph(plholder_dict, obj_feat, atr_feat, vocab, inv_vocab, ans_vocab_size, mode): @@ -426,6 +485,160 @@ def ans_comp_graph(plholder_dict, obj_feat, atr_feat, return y_pred +def ans_comp_margin_graph(plholder_dict, obj_feat, atr_feat, obj_prob, atr_prob, + vocab, inv_vocab, ans_vocab, mode): + vocab_size = len(vocab) + image_regions = plholder_dict['image_regions'] + keep_prob = plholder_dict['keep_prob'] + ans_vocab_size = len(ans_vocab) + + inv_ans_vocab = {v:k for k, v in ans_vocab.items()} + ans_in_vocab_ids_list = [] + for i in xrange(ans_vocab_size): + ans_in_vocab_ids_list.append(vocab[inv_ans_vocab[i]]) + + ans_in_vocab_ids_tensor = tf.constant(ans_in_vocab_ids_list, dtype=tf.int64) + + with tf.name_scope('ans') as ans_graph: + + with tf.name_scope('word_embed') as word_embed: + + word_vecs = weight_variable([vocab_size, + graph_config['word_vec_dim']], + var_name='word_vecs') + + bin0_embed = q_bin_embed_graph('bin0', word_vecs, plholder_dict) 
+ bin1_embed = q_bin_embed_graph('bin1', word_vecs, plholder_dict) + bin2_embed = q_bin_embed_graph('bin2', word_vecs, plholder_dict) + bin3_embed = q_bin_embed_graph('bin3', word_vecs, plholder_dict) + q_feat = tf.concat(1, [bin0_embed, + bin1_embed, + bin2_embed, + bin3_embed], name='q_feat') + + ans_embed = tf.nn.embedding_lookup(word_vecs, ans_in_vocab_ids_list, + name='ans_embed') + + with tf.name_scope('explicit_feat') as expl_feat: + explt_feat_list = [] + for bin_num in xrange(4): + bin_name = 'bin'+ str(bin_num) + explt_feat_list.append(explicit_feat_graph(bin_name, obj_prob, + 'obj', plholder_dict)) + explt_feat_list.append(explicit_feat_graph(bin_name, atr_prob, + 'atr', plholder_dict)) + + concat_explt_feat = tf.concat(1, explt_feat_list, + name = 'concat_explt_feat') + + concat_explt_feat_dim = concat_explt_feat.get_shape()[1].value + print('Concatenate explicit feature dimension: ' + \ + str(concat_explt_feat_dim)) + + with tf.name_scope('conv1') as conv1: + num_filters_conv1 = 4 + W_conv1 = weight_variable([5,5,3,num_filters_conv1]) + b_conv1 = bias_variable([num_filters_conv1]) + a_conv1 = tf.add(conv2d(image_regions, W_conv1), b_conv1, name='a') + h_conv1 = tf.nn.relu(a_conv1, name='h') + h_pool1 = max_pool_2x2(h_conv1) + h_conv1_drop = tf.nn.dropout(h_pool1, keep_prob, name='h_pool_drop') + + with tf.name_scope('conv2') as conv2: + num_filters_conv2 = 8 + W_conv2 = weight_variable([3,3,num_filters_conv1,num_filters_conv2]) + b_conv2 = bias_variable([num_filters_conv2]) + a_conv2 = tf.add(conv2d(h_pool1, W_conv2), b_conv2, name='a') + h_conv2 = tf.nn.relu(a_conv2, name='h') + h_pool2 = max_pool_2x2(h_conv2) + h_pool2_drop = tf.nn.dropout(h_pool2, keep_prob, name='h_pool_drop') + h_pool2_drop_shape = h_pool2_drop.get_shape() + region_feat_dim = reduce(lambda f, g: f*g, + [dim.value for dim in h_pool2_drop_shape[1:]]) + region_feat = tf.reshape(h_pool2_drop, [-1, region_feat_dim], + name='region_feat') + + print('Region feature dimension: ' + str(region_feat_dim)) #392 + + with tf.name_scope('fc1') as fc1: + + fc1_dim = graph_config['ans_fc1_dim'] + W_region_fc1 = weight_variable([region_feat_dim, + fc1_dim], var_name='W_region') + W_obj_fc1 = weight_variable([graph_config['obj_feat_dim'], + fc1_dim], var_name='W_obj') + W_atr_fc1 = weight_variable([graph_config['atr_feat_dim'], + fc1_dim], var_name='W_atr') + W_q_fc1 = weight_variable([graph_config['q_embed_dim'], + fc1_dim], var_name='W_q') + W_explt_fc1 = weight_variable([concat_explt_feat_dim, + fc1_dim], var_name='W_explt') + b_fc1 = bias_variable([fc1_dim]) + + a_fc1_region = tf.matmul(region_feat, W_region_fc1, + name='a_fc1_region') + a_fc1_obj = tf.matmul(obj_feat, W_obj_fc1, name='a_fc1_obj') + a_fc1_atr = tf.matmul(atr_feat, W_atr_fc1, name='a_fc1_atr') + a_fc1_q = tf.matmul(q_feat, W_q_fc1, name='a_fc1_q') + a_explt_fc1 = tf.matmul(concat_explt_feat, W_explt_fc1, + name='a_explt_fc1') + coeff_reg = 0.0 + coeff_obj = 0.0 + coeff_atr = 0.0 + coeff_q = 0.0 + coeff_explt = 0.0 + + if mode=='q': + coeff_q = 1.0 + + elif mode=='q_reg': + coeff_q = 1/2.0 + coeff_reg = 1/2.0 + + elif mode=='q_obj_atr': + coeff_q = 1/4.0 + coeff_obj = 1/4.0 + coeff_atr = 1/4.0 + coeff_explt = 1/4.0 + + elif mode=='q_obj_atr_reg': + coeff_q = 1/5.0 + coeff_obj = 1/5.0 + coeff_atr = 1/5.0 + coeff_reg = 1/5.0 + coeff_explt = 1/5.0 + + a_fc1 = coeff_reg * a_fc1_region + \ + coeff_obj * a_fc1_obj + \ + coeff_atr * a_fc1_atr + \ + coeff_q * a_fc1_q + + h_fc1 = tf.nn.relu(a_fc1, name='h') + h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob, 
name='h_drop') + + with tf.name_scope('fc2') as fc2: + W_feat_fc2 = weight_variable([fc1_dim, + graph_config['word_vec_dim']], + var_name='W_feat') + b_feat_fc2 = bias_variable([graph_config['word_vec_dim']], + var_name='b_feat') + W_ans_fc2 = weight_variable([graph_config['word_vec_dim'], + graph_config['word_vec_dim']], + var_name='W_ans') + b_ans_fc2 = bias_variable([graph_config['word_vec_dim']], + var_name='b_ans') + comb_feat_embed = tf.add(tf.matmul(h_fc1_drop, W_feat_fc2), + b_feat_fc2, + name='comb_feat_embed') + comb_ans_embed = tf.add(tf.matmul(ans_embed, W_ans_fc2), + b_ans_fc2, + name='comb_feat_embed') + ans_scores = tf.matmul(comb_feat_embed, tf.transpose(comb_ans_embed), + name='ans_scores') + ans_scores = tf.nn.l2_normalize(ans_scores, 1)*3.0 + return tf.nn.softmax(ans_scores) + + def aggregate_y_pred(y_pred, region_score, batch_size, num_proposals, ans_vocab_size): y_pred_list = tf.split(0, batch_size, y_pred) @@ -453,6 +666,12 @@ def loss(y, y_pred): return tf.truediv(cross_entropy, tf.cast(batch_size[0],tf.float32)) +def margin_loss(y, y_pred, margin): + correct_score = tf.reduce_sum(tf.mul(y, y_pred), 1, + keep_dims=True, name='correct_score') + return tf.reduce_mean(tf.maximum(0.0, y + margin - correct_score)) + + def regularize_params(param_list): regularizer = tf.zeros(shape=[]) for param in param_list: diff --git a/classifiers/train_classifiers.py b/classifiers/train_classifiers.py index 015c6eb..b8972de 100644 --- a/classifiers/train_classifiers.py +++ b/classifiers/train_classifiers.py @@ -65,16 +65,17 @@ rel_classifier_train_params = { 'train_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/train_anno.json', 'test_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/test_anno.json', 'regions_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/regions_anno.json', + 'parsed_q_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/parsed_questions.json', 'image_dir': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/images', 'image_regions_dir': '/mnt/ramdisk/image_regions', - 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Prob', + 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Explt', 'obj_atr_model': '/home/tanmay/Code/GenVQA/Exp_Results/Atr_Classifier/obj_atr_classifier-1', - 'mode': 'q_obj_atr', - 'adam_lr' : 0.001, + 'mode': 'q_obj_atr_reg_explt', + 'adam_lr' : 0.0001, 'crop_n_save_regions': False, - 'max_epoch': 5, + 'max_epoch': 10, 'batch_size': 10, - 'fine_tune': False, + 'fine_tune': True, 'start_model': 4, # Used only if fine_tune is True } @@ -82,12 +83,13 @@ rel_classifier_eval_params = { 'train_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/train_anno.json', 'test_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/test_anno.json', 'regions_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/regions_anno.json', + 'parsed_q_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/parsed_questions.json', 'image_dir': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/images', 'image_regions_dir': '/mnt/ramdisk/image_regions', - 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Prob', - 'model_basedir': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Prob', - 'model_number': 4, - 'mode': 'q_obj_atr', + 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Explt', + 'model_basedir': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Explt', + 'model_number': 9, + 'mode': 'q_obj_atr_reg_explt', 'batch_size': 20, 'test_start_id': 
94645, 'test_set_size': 143495-94645+1, @@ -100,15 +102,15 @@ ans_classifier_train_params = { 'parsed_q_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/parsed_questions.json', 'image_dir': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/images', 'image_regions_dir': '/mnt/ramdisk/image_regions', - 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_w_Rel', - 'rel_model': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier/rel_classifier_q_obj_atr-4', + 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_Margin', + 'rel_model': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Explt/rel_classifier_q_obj_atr_reg_explt-9', 'obj_atr_model': '/home/tanmay/Code/GenVQA/Exp_Results/Atr_Classifier/obj_atr_classifier-1', - 'adam_lr' : 0.001, - 'mode' : 'q', + 'adam_lr' : 0.0001, + 'mode' : 'q_obj_atr', 'crop_n_save_regions': False, - 'max_epoch': 5, + 'max_epoch': 10, 'batch_size': 10, - 'fine_tune': False, + 'fine_tune': True, 'start_model': 4, # When fine_tune is false used to pre-initialize q_obj_atr with q model etc } -- GitLab
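
For reference, a minimal NumPy sketch of what the `margin_loss` added above in `tf_graph_creation_helper.py` evaluates, assuming `y` is a one-hot ground-truth matrix and `y_pred` holds the aggregated softmax answer scores; this is only an illustration of the committed code, not part of the patch itself.

    # Illustration (not part of the patch): NumPy restatement of margin_loss,
    # assuming y is one-hot ground truth and y_pred are softmax answer scores.
    import numpy as np

    def margin_loss_np(y, y_pred, margin=0.2):
        # Score assigned to the ground-truth answer for each example.
        correct_score = np.sum(y * y_pred, axis=1, keepdims=True)
        # Hinge penalty wherever y + margin exceeds the ground-truth score,
        # mirroring tf.reduce_mean(tf.maximum(0.0, y + margin - correct_score)).
        return np.mean(np.maximum(0.0, y + margin - correct_score))

    y = np.array([[0.0, 1.0, 0.0]])          # ground-truth answer is class 1
    y_pred = np.array([[0.2, 0.5, 0.3]])     # aggregated softmax scores
    print(margin_loss_np(y, y_pred))         # ~0.233 for this example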