diff --git a/classifiers/ans_graph_creator.py b/classifiers/ans_graph_creator.py new file mode 100644 index 0000000000000000000000000000000000000000..91fe8e5aa60c5087191be0e82b61b8db99ef2d20 --- /dev/null +++ b/classifiers/ans_graph_creator.py @@ -0,0 +1,172 @@ +import numpy as np +import math +import pdb +import tensorflow as tf +import answer_classifier.ans_data_io_helper as ans_io_helper +from tf_graph_creation_helper import weight_variable, bias_variable, \ + q_bin_embed_graph, conv2d, max_pool_2x2, batchnorm + + +class ans_graph_creator(): + def __init__(self, + plholder_dict, + obj_feat, + atr_feat, + obj_prob, + atr_prob, + vocab, + inv_vocab, + ans_vocab, + batch_size, + graph_config, + mode='q_obj_atr', + is_train=True): + + self.mode = mode + self.is_train = plholder_dict['is_train'] + self.keep_prob = plholder_dict['keep_prob'] + image_regions = plholder_dict['image_regions'] + vocab_size = len(vocab) + + with tf.name_scope('ans') as ans_graph: + # Word Vectors + word_vecs = self.create_word_vecs(vocab_size, + graph_config['word_vec_dim']) + + # Feature Computations + q_feat = self.add_q_feat_comp_layer(word_vecs, plholder_dict) + reg_feat = self.add_reg_feat_comp_layer(image_regions) + + # Feature Projections (with batch norm) + feat_proj_dim = graph_config['joint_embed_dim'] + proj_feat = dict() + + proj_feat['q'] = self.fc_layer(q_feat, feat_proj_dim, + 'q_feat_proj_layer') + + proj_feat['reg'] = self.fc_layer(reg_feat, feat_proj_dim, + 'reg_feat_proj_layer') + + proj_feat['obj'] = self.fc_layer(obj_feat, feat_proj_dim, + 'obj_feat_proj_layer') + + proj_feat['atr'] = self.fc_layer(atr_feat, feat_proj_dim, + 'atr_feat_proj_layer') + + # Feature Combination + coeffs = self.mixing_coeffs() + print coeffs + num_regions = batch_size*ans_io_helper.num_proposals + comb_feat = tf.zeros(shape=[num_regions, feat_proj_dim], + dtype=tf.float32) + for feat_type, feat in proj_feat.items(): + comb_feat = comb_feat + feat * coeffs[feat_type] + + # Answer feature + ans_feat = self.compute_ans_feat(word_vecs, vocab, ans_vocab) + + # Proj answer + proj_ans_feat = self.fc_layer(ans_feat, feat_proj_dim, + 'ans_feat_proj_layer') + + # Compute Cosine Distance + self.cosine_dist = self.compute_cosine_dist(comb_feat, + proj_ans_feat) + + def create_word_vecs(self, vocab_size, word_vec_dim): + word_vecs = weight_variable([vocab_size, + word_vec_dim], + var_name='word_vecs') + word_vecs = tf.nn.l2_normalize(word_vecs, 1) + tf.add_to_collection('regularize',word_vecs) + return word_vecs + + def add_q_feat_comp_layer(self, word_vecs, plholder_dict): + with tf.name_scope('q_feat_comp_layer') as q_feat_comp_layer: + bin0_embed = q_bin_embed_graph('bin0', word_vecs, plholder_dict) + bin1_embed = q_bin_embed_graph('bin1', word_vecs, plholder_dict) + bin2_embed = q_bin_embed_graph('bin2', word_vecs, plholder_dict) + bin3_embed = q_bin_embed_graph('bin3', word_vecs, plholder_dict) + q_feat = tf.concat(1, + [bin0_embed, bin1_embed, bin2_embed, bin3_embed], + name='q_feat') + return q_feat + + def add_reg_feat_comp_layer(self, image_regions): + with tf.name_scope('reg_feat_comp_layer') as reg_feat_comp_layer: + with tf.name_scope('conv1') as conv1: + W_conv1 = weight_variable([5,5,3,4]) + b_conv1 = bias_variable([4]) + a_conv1 = tf.add(conv2d(image_regions, W_conv1), + b_conv1, name='a') + h_conv1 = tf.nn.relu(a_conv1, name='h') + h_pool1 = max_pool_2x2(h_conv1) + h_conv1_drop = tf.nn.dropout(h_pool1, self.keep_prob, + name='h_pool_drop') + + with tf.name_scope('conv2') as conv2: + W_conv2 = 
weight_variable([3,3,4,8])
+                b_conv2 = bias_variable([8])
+                a_conv2 = tf.add(conv2d(h_conv1_drop, W_conv2), b_conv2,
+                                 name='a')
+                h_conv2 = tf.nn.relu(a_conv2, name='h')
+                h_pool2 = max_pool_2x2(h_conv2)
+                h_pool2_drop = tf.nn.dropout(h_pool2, self.keep_prob,
+                                             name='h_pool_drop')
+                h_pool2_drop_shape = h_pool2_drop.get_shape()
+                reg_feat_dim = reduce(lambda f, g: f*g,
+                                      [dim.value for dim in
+                                       h_pool2_drop_shape[1:]])
+                reg_feat = tf.reshape(h_pool2_drop, [-1, reg_feat_dim],
+                                      name='reg_feat')
+
+            tf.add_to_collection('regularize', W_conv1)
+            tf.add_to_collection('regularize', W_conv2)
+
+        return reg_feat
+
+    def fc_layer(self, feat, proj_dim, name_scope):
+        with tf.name_scope(name_scope) as fc_layer:
+            feat_dim = feat.get_shape()[1].value
+            W1 = weight_variable([feat_dim, proj_dim])
+            b1 = bias_variable([proj_dim])
+            proj_feat = tf.add(tf.matmul(feat, W1), b1)
+            bn_proj_feat = batchnorm(proj_feat, None, self.is_train)
+            W2 = weight_variable([proj_dim, proj_dim])
+            b2 = bias_variable([proj_dim])
+            bn_proj_feat = tf.add(tf.matmul(tf.nn.relu(bn_proj_feat), W2), b2)
+            tf.add_to_collection('regularize', W1)
+            tf.add_to_collection('regularize', W2)
+
+        return bn_proj_feat
+
+    def mixing_coeffs(self):
+        feat_types = ['q', 'obj', 'atr', 'reg']
+        coeffs = dict()
+        count = 0
+        for feat_type in feat_types:
+            if feat_type in self.mode:
+                coeffs[feat_type] = 1.0
+                count += 1
+            else:
+                coeffs[feat_type] = 0.0
+        coeffs = {k: v/count for k, v in coeffs.items()}
+        return coeffs
+
+    def compute_ans_feat(self, word_vecs, vocab, ans_vocab):
+        ans_vocab_size = len(ans_vocab)
+        inv_ans_vocab = {v: k for k, v in ans_vocab.items()}
+        ans_in_vocab_ids_list = []
+        for i in xrange(ans_vocab_size):
+            ans_in_vocab_ids_list.append(vocab[inv_ans_vocab[i]])
+
+        ans_in_vocab_ids_tensor = tf.constant(ans_in_vocab_ids_list,
+                                              dtype=tf.int64)
+        ans_feat = tf.nn.embedding_lookup(word_vecs, ans_in_vocab_ids_tensor,
+                                          name='ans_feat')
+        return ans_feat
+
+    def compute_cosine_dist(self, feat1, feat2):
+        # L2-normalized dot products, i.e. cosine similarities
+        feat1 = tf.nn.l2_normalize(feat1, 1)
+        feat2 = tf.nn.l2_normalize(feat2, 1)
+        return tf.matmul(feat1, tf.transpose(feat2))
+
diff --git a/classifiers/answer_classifier/ans_data_io_helper.py b/classifiers/answer_classifier/ans_data_io_helper.py
index d5e419b4919c0fab8672ce044135fb9bc3feb550..b6b51eb123140dc5290ea4f82c4b2bea4428df62
--- a/classifiers/answer_classifier/ans_data_io_helper.py
+++ b/classifiers/answer_classifier/ans_data_io_helper.py
@@ -309,11 +309,12 @@ class RelFeedDictCreator(FeedDictCreator):
 
 class AnsFeedDictCreator(FeedDictCreator):
     def __init__(self, region_images, ans_labels, parsed_q,
-                 region_scores, keep_prob, plholder_dict, vocab):
+                 region_scores, keep_prob, plholder_dict, vocab, is_train):
         FeedDictCreator.__init__(self, region_images, parsed_q,
                                  keep_prob, plholder_dict, vocab)
         self.feed_dict[plholder_dict['gt_answer']] = ans_labels
         self.feed_dict[plholder_dict['region_score']] = region_scores
+        self.feed_dict[plholder_dict['is_train']] = is_train
 
 
 class html_ans_table_writer():
diff --git a/classifiers/answer_classifier/eval_ans_classifier_simple.py b/classifiers/answer_classifier/eval_ans_classifier_simple.py
new file mode 100644
index 0000000000000000000000000000000000000000..d50ad5db3cc1229dcc94f67410f5660395f5105d
--- /dev/null
+++ b/classifiers/answer_classifier/eval_ans_classifier_simple.py
@@ -0,0 +1,314 @@
+import sys
+import os
+import json
+import matplotlib.pyplot as plt
+import matplotlib.image as mpimg
+import numpy as np
+import math
+import random
+import pdb
+import tensorflow as tf
+import tf_graph_creation_helper as graph_creator +import ans_graph_creator +import plot_helper as plotter +import ans_data_io_helper as ans_io_helper +import region_ranker.perfect_ranker as region_proposer +import train_ans_classifier as ans_trainer +from PIL import Image, ImageDraw + +def get_pred(y, qa_anno_dict, region_anno_dict, parsed_q_dict, ans_vocab, vocab, + image_dir, mean_image, start_index, val_set_size, batch_size, + plholder_dict, img_height, img_width, batch_creator): + + inv_ans_vocab = {v: k for k, v in ans_vocab.items()} + pred_list = [] + correct = 0 + max_iter = int(math.ceil(val_set_size*1.0/batch_size)) + batch_size_tmp = batch_size + for i in xrange(max_iter): + if i==(max_iter-1): + batch_size_tmp = val_set_size - i*batch_size + + print('Iter: ' + str(i+1) + '/' + str(max_iter)) + + region_images, ans_labels, parsed_q, \ + region_score, partition = batch_creator \ + .ans_mini_batch_loader(qa_anno_dict, + region_anno_dict, + ans_vocab, vocab, + image_dir, mean_image, + start_index+i*batch_size, + batch_size_tmp, parsed_q_dict, + img_height, img_width, 3) + + if i==max_iter-1: + + residual_batch_size = batch_size - batch_size_tmp + residual_regions = residual_batch_size*ans_io_helper.num_proposals + + residual_region_images = np.zeros(shape=[residual_regions, + img_height/3, img_width/3, + 3]) + # residual_questions = np.zeros(shape=[residual_regions, + # len(vocab)]) + + residual_ans_labels = np.zeros(shape=[residual_batch_size, + len(ans_vocab)]) + residual_region_score = np.zeros(shape=[1, residual_regions]) + + region_images = np.concatenate((region_images, + residual_region_images), + axis=0) +# questions = np.concatenate((questions, residual_questions), axis=0) + for k in xrange(batch_size_tmp*22, batch_size*22): + parsed_q[k] = { + 'bin0': [''], + 'bin1': [''], + 'bin2': [''], + 'bin3': [''], + } + + ans_labels = np.concatenate((ans_labels, residual_ans_labels), + axis=0) + region_score = np.concatenate((region_score, residual_region_score), + axis=1) + + + feed_dict = ans_io_helper \ + .AnsFeedDictCreator(region_images, + ans_labels, + parsed_q, + region_score, + 1.0, + plholder_dict, + vocab, + False).feed_dict + + ans_ids = np.argmax(y.eval(feed_dict), 1) + for j in xrange(batch_size_tmp): + pred_list = pred_list + [{ + 'question_id' : start_index+i*batch_size+j, + 'answer' : inv_ans_vocab[ans_ids[j]] + }] + + return pred_list + +def eval(eval_params): + sess = tf.InteractiveSession() + + train_anno_filename = eval_params['train_json'] + test_anno_filename = eval_params['test_json'] + parsed_q_filename = eval_params['parsed_q_json'] + regions_anno_filename = eval_params['regions_json'] + image_regions_dir = eval_params['image_regions_dir'] + outdir = eval_params['outdir'] + model = eval_params['model'] + batch_size = eval_params['batch_size'] + test_start_id = eval_params['test_start_id'] + test_set_size = eval_params['test_set_size'] + if not os.path.exists(outdir): + os.mkdir(outdir) + + qa_anno_dict_train = ans_io_helper.parse_qa_anno(train_anno_filename) + qa_anno_dict = ans_io_helper.parse_qa_anno(test_anno_filename) + parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename) + region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) + ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() + vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict_train) + + # Create graph + g = tf.get_default_graph() + plholder_dict = graph_creator.placeholder_inputs_ans(len(vocab), + len(ans_vocab), + mode='gt') + + image_regions = 
plholder_dict['image_regions'] + questions = plholder_dict['questions'] + keep_prob = plholder_dict['keep_prob'] + y = plholder_dict['gt_answer'] + region_score = plholder_dict['region_score'] + + y_pred_obj = graph_creator.obj_comp_graph(image_regions, 1.0) + obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') + obj_feat = obj_feat_op.outputs[0] + y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) + atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') + atr_feat = atr_feat_op.outputs[0] + pred_rel_score = graph_creator.rel_comp_graph(plholder_dict, + obj_feat, atr_feat, + y_pred_obj, y_pred_atr, + 'q_obj_atr_reg', + 1.0, len(vocab), batch_size) + + ans_graph = ans_graph_creator.ans_graph_creator(plholder_dict, + obj_feat, + atr_feat, + y_pred_obj, + y_pred_atr, + vocab, + inv_vocab, + ans_vocab, + batch_size, + graph_creator.graph_config, + eval_params['mode'], + True) + + y_pred = ans_graph.cosine_dist + + pred_rel_score_vec = tf.reshape(pred_rel_score, + [1, batch_size*ans_io_helper.num_proposals]) + + y_avg = graph_creator.aggregate_y_pred(y_pred, region_score, + batch_size, + ans_io_helper.num_proposals, + len(ans_vocab)) + + accuracy = graph_creator.evaluation(y, y_avg) + + # Collect variables + rel_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='rel') + obj_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='obj') + atr_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='atr') + + # Restore model + restorer = tf.train.Saver() + if os.path.exists(model): + restorer.restore(sess, model) + else: + print 'Failed to read model from file ' + model + +# sess.run(tf.initialize_variables(vars_to_init)) + + mean_image = np.load('/home/tanmay/Code/GenVQA/Exp_Results/' + \ + 'Obj_Classifier/mean_image.npy') + + # Batch creator + test_batch_creator = ans_io_helper.batch_creator(test_start_id, + test_start_id + + test_set_size - 1) + # Get predictions + pred_dict = get_pred(y_avg, qa_anno_dict, region_anno_dict, + parsed_q_dict, ans_vocab, + vocab, image_regions_dir, mean_image, test_start_id, + test_set_size, batch_size, plholder_dict, 75, 75, + test_batch_creator) + + json_filename = os.path.join(outdir, 'predicted_ans_' + \ + eval_params['mode'] + '.json') + with open(json_filename,'w') as json_file: + json.dump(pred_dict, json_file) + + +def create_html_file(outdir, test_anno_filename, regions_anno_filename, + pred_json_filename, image_dir, num_pred_to_display, mode): + qa_dict = ans_io_helper.parse_qa_anno(test_anno_filename) + region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) + ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() + + with open(pred_json_filename,'r') as json_file: + raw_data = json.load(json_file) + + # Create director for storing images with region boxes + images_bbox_dir = os.path.join(outdir, 'images_bbox' + '_' + mode) + if not os.path.exists(images_bbox_dir): + os.mkdir(images_bbox_dir) + + col_dict = { + 0 : 'Question_Id', + 1 : 'Question', + 2 : 'Answer (GT)', + 3 : 'Answer (Pred)', + 4 : 'Image', + } + html_correct_filename = os.path.join(outdir, + 'correct_ans_' + mode + '.html') + html_writer_correct = ans_io_helper \ + .html_ans_table_writer(html_correct_filename) + html_writer_correct.add_element(col_dict) + + html_incorrect_filename = os.path.join(outdir, + 'incorrect_ans_' + mode + '.html') + html_writer_incorrect = ans_io_helper \ + .html_ans_table_writer(html_incorrect_filename) + html_writer_incorrect.add_element(col_dict) + + region_coords, region_coords_ = 
region_proposer.get_region_coords(300,300)
+
+    random.shuffle(raw_data)
+
+    count = 0
+    for entry in raw_data:
+        if count == num_pred_to_display:
+            break
+        q_id = entry['question_id']
+        pred_ans = entry['answer']
+        gt_ans = qa_dict[q_id].answer
+        question = qa_dict[q_id].question
+        img_id = qa_dict[q_id].image_id
+        image_filename = os.path.join(image_dir, str(img_id) + '.jpg')
+        image = Image.open(image_filename)
+
+        regions = region_proposer.rank_regions(image, question, region_coords,
+                                               region_coords_,
+                                               region_anno_dict[img_id],
+                                               crop=False)
+        dr = ImageDraw.Draw(image)
+        # print(q_id)
+        # print([regions[key].score for key in regions.keys()])
+        for i in xrange(ans_io_helper.num_proposals):
+            if regions[i].score != 0:
+                coord = regions[i].coord
+                x1 = coord[0]
+                y1 = coord[1]
+                x2 = coord[2]
+                y2 = coord[3]
+                dr.rectangle([(x1,y1),(x2,y2)], outline="red")
+
+        image_bbox_filename = os.path.join(images_bbox_dir, str(q_id) + '.jpg')
+        image.save(image_bbox_filename)
+        image_bbox_filename_rel = 'images_bbox_'+ mode +'/'+ str(q_id) + '.jpg'
+        col_dict = {
+            0 : q_id,
+            1 : question,
+            2 : gt_ans,
+            3 : pred_ans,
+            4 : html_writer_correct.image_tag(image_bbox_filename_rel,50,50)
+        }
+        if pred_ans==gt_ans:
+            html_writer_correct.add_element(col_dict)
+        else:
+            html_writer_incorrect.add_element(col_dict)
+
+        count += 1
+
+    html_writer_correct.close_file()
+    html_writer_incorrect.close_file()
+
+
+if __name__=='__main__':
+    mode = 'q_obj_atr'
+    model_num = 4
+    ans_classifier_eval_params = {
+        'train_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/train_anno.json',
+        'test_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/test_anno.json',
+        'regions_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/regions_anno.json',
+        'parsed_q_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/parsed_questions.json',
+        'image_dir': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/images',
+        'image_regions_dir': '/mnt/ramdisk/image_regions',
+        'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_Margin',
+        'rel_model': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Explt/rel_classifier_q_obj_atr_reg_explt-9',
+        'model': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_Margin/ans_classifier_' + mode + '-' + str(model_num),
+        'mode' : mode,
+        'batch_size': 20,
+        'test_start_id': 94645,
+        'test_set_size': 143495-94645+1,
+    }
+
+    eval(ans_classifier_eval_params)
+    outdir = ans_classifier_eval_params['outdir']
+    test_anno_filename = ans_classifier_eval_params['test_json']
+    regions_anno_filename = ans_classifier_eval_params['regions_json']
+    pred_json_filename = os.path.join(outdir, 'predicted_ans_'+ mode +'.json')
+    image_dir = ans_classifier_eval_params['image_dir']
+    create_html_file(outdir, test_anno_filename, regions_anno_filename,
+                     pred_json_filename, image_dir, 1000, mode)
diff --git a/classifiers/answer_classifier/train_ans_classifier.py b/classifiers/answer_classifier/train_ans_classifier.py
index f57515c5e33ff8a270ea621aa058689464e09e5c..4b4f1ca8a89da827f6ca65f02794bf0bcd831823 100644
--- a/classifiers/answer_classifier/train_ans_classifier.py
+++ b/classifiers/answer_classifier/train_ans_classifier.py
@@ -36,7 +36,7 @@ def get_process_flow_vars(mode, obj_vars, atr_vars, rel_vars, fine_tune):
         'ans/fc2/W_feat',
         'ans/fc2/b_feat',
         'ans/fc2/W_ans',
-        'ans/fc2/b_ans'
+        'ans/fc2/b_ans',
     ]
 
     vars_dict = graph_creator.get_list_of_variables(list_of_vars)
@@ -237,7 +237,7 @@ def train(train_params):
                                                obj_feat, atr_feat,
                                                y_pred_obj, y_pred_atr,
                                                vocab, inv_vocab,
                                                ans_vocab,
-                                               train_params['mode'])
+                                               train_params['mode'], True)
     pred_rel_score_vec = tf.reshape(pred_rel_score,
                                     [1, batch_size*ans_io_helper.num_proposals])
@@ -246,8 +246,8 @@ def train(train_params):
                                          ans_io_helper.num_proposals,
                                          len(ans_vocab))
 
-#    cross_entropy = graph_creator.loss(y, y_avg)
-    margin_loss = graph_creator.margin_loss(y, y_avg, 0.2)
+    cross_entropy = graph_creator.loss(y, y_avg)
+    #margin_loss = graph_creator.margin_loss(y, y_avg, 1)
     accuracy = graph_creator.evaluation(y, y_avg)
 
     # Collect variables
@@ -290,7 +290,7 @@ def train(train_params):
 #        1e-5 * regularizer_ans_fcs + \
 #        1e-3 * regularizer_ans_filters
 
-    total_loss = margin_loss + \
+    total_loss = cross_entropy + \
                  1e-5 * regularizer_ans_word_vecs + \
                  1e-5 * regularizer_ans_fcs + \
                  1e-3 * regularizer_ans_filters
diff --git a/classifiers/answer_classifier/train_ans_classifier_simple.py b/classifiers/answer_classifier/train_ans_classifier_simple.py
new file mode 100644
index 0000000000000000000000000000000000000000..b787ac53a99796b9eddd406a53989afd03fa36b5
--- /dev/null
+++ b/classifiers/answer_classifier/train_ans_classifier_simple.py
@@ -0,0 +1,343 @@
+import sys
+import os
+import json
+import matplotlib.pyplot as plt
+import matplotlib.image as mpimg
+import numpy as np
+import math
+import random
+import pdb
+import tensorflow as tf
+import object_classifiers.obj_data_io_helper as obj_data_loader
+import attribute_classifiers.atr_data_io_helper as atr_data_loader
+import tf_graph_creation_helper as graph_creator
+import ans_graph_creator
+import plot_helper as plotter
+import ans_data_io_helper as ans_io_helper
+import region_ranker.perfect_ranker as region_proposer
+import time
+
+val_start_id = 89645
+val_set_size = 5000
+val_set_size_small = 500
+
+
+def evaluate(accuracy, qa_anno_dict, region_anno_dict, ans_vocab, vocab,
+             image_dir, mean_image, start_index, val_set_size, batch_size,
+             plholder_dict, img_height, img_width, batch_creator,
+             parsed_q_dict):
+
+    correct = 0
+    max_iter = int(math.floor(val_set_size/batch_size))
+    for i in xrange(max_iter):
+        region_images, ans_labels, parsed_q, \
+        region_score, partition = batch_creator \
+            .ans_mini_batch_loader(qa_anno_dict, region_anno_dict,
+                                   ans_vocab, vocab, image_dir, mean_image,
+                                   start_index+i*batch_size, batch_size,
+                                   parsed_q_dict,
+                                   img_height, img_width, 3)
+
+        feed_dict = ans_io_helper \
+            .AnsFeedDictCreator(region_images, ans_labels, parsed_q,
+                                region_score, 1.0, plholder_dict,
+                                vocab, False).feed_dict
+
+        correct = correct + accuracy.eval(feed_dict)
+
+    return correct/max_iter
+
+
+def train(train_params):
+    sess = tf.InteractiveSession()
+
+    train_anno_filename = train_params['train_json']
+    test_anno_filename = train_params['test_json']
+    parsed_q_filename = train_params['parsed_q_json']
+    regions_anno_filename = train_params['regions_json']
+    image_dir = train_params['image_dir']
+    image_regions_dir = train_params['image_regions_dir']
+    outdir = train_params['outdir']
+    rel_model = train_params['rel_model']
+    obj_atr_model = train_params['obj_atr_model']
+    batch_size = train_params['batch_size']
+
+    if not os.path.exists(outdir):
+        os.mkdir(outdir)
+
+    qa_anno_dict = ans_io_helper.parse_qa_anno(train_anno_filename)
+    parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename)
+    region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename)
+    ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict()
+    vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict)
+#    vocab = ans_io_helper.join_vocab(vocab,
ans_vocab) + + # Save region crops + if train_params['crop_n_save_regions'] == True: + qa_anno_dict_test = ans_io_helper.parse_qa_anno(test_anno_filename) + ans_io_helper.save_regions(image_dir, image_regions_dir, + qa_anno_dict, region_anno_dict, + 1, 94644, 75, 75) + ans_io_helper.save_regions(image_dir, image_regions_dir, + qa_anno_dict_test, region_anno_dict, + 94645, 143495-94645+1, 75, 75) + + # Create graph + g = tf.get_default_graph() + plholder_dict = graph_creator.placeholder_inputs_ans(len(vocab), + len(ans_vocab), + mode='gt') + + image_regions = plholder_dict['image_regions'] + questions = plholder_dict['questions'] + keep_prob = plholder_dict['keep_prob'] + y = plholder_dict['gt_answer'] + region_score = plholder_dict['region_score'] + + y_pred_obj = graph_creator.obj_comp_graph(image_regions, 1.0) + obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') + obj_feat = obj_feat_op.outputs[0] + y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) + atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') + atr_feat = atr_feat_op.outputs[0] + + pred_rel_score = graph_creator.rel_comp_graph(plholder_dict, + obj_feat, atr_feat, + y_pred_obj, y_pred_atr, + 'q_obj_atr_reg_explt', + 1.0, len(vocab), + batch_size) + + # Restore rel, obj and attribute classifier parameters + rel_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='rel') + obj_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='obj') + atr_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='atr') + + obj_atr_saver = tf.train.Saver(obj_vars+atr_vars) + rel_saver = tf.train.Saver(rel_vars) + + rel_saver.restore(sess, rel_model) + obj_atr_saver.restore(sess, obj_atr_model) + + ans_graph = ans_graph_creator.ans_graph_creator(plholder_dict, + obj_feat, + atr_feat, + y_pred_obj, + y_pred_atr, + vocab, + inv_vocab, + ans_vocab, + batch_size, + graph_creator.graph_config, + train_params['mode'], + True) + + y_pred = ans_graph.cosine_dist + + pred_rel_score_vec = tf.reshape(pred_rel_score, + [1, batch_size*ans_io_helper.num_proposals]) + + y_avg = graph_creator.aggregate_y_pred(y_pred, + pred_rel_score_vec, batch_size, + ans_io_helper.num_proposals, + len(ans_vocab)) + +# cross_entropy = graph_creator.loss(y, y_avg) + total_loss = graph_creator.margin_loss(y, y_avg, 0.1) + accuracy = graph_creator.evaluation(y, y_avg) + + # Collect variables + ans_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='ans') + vars_to_regularize = tf.get_collection('regularize') + + for var in vars_to_regularize: + print var.name + total_loss += 1e-4 * tf.nn.l2_loss(var) + + # Model to restore some of the weights from + if train_params['mode']=='q': + partial_model = '' + + elif train_params['mode']=='q_obj_atr' or \ + train_params['mode']=='q_reg': + partial_model = os.path.join(outdir, 'ans_classifier_q-' + \ + str(train_params['start_model'])) + + elif train_params['mode']=='q_obj_atr_reg': + partial_model = os.path.join(outdir, 'ans_classifier_q_obj_atr-' + \ + str(train_params['start_model'])) + + # Fine tune begining with a previous model + if train_params['fine_tune']==True: + partial_model = os.path.join(outdir, 'ans_classifier_' + \ + train_params['mode'] + '-' + \ + str(train_params['start_model'])) + start_epoch = train_params['start_model']+1 + + partial_restorer = tf.train.Saver() + else: + start_epoch = 0 + if train_params['mode']!='q': + partial_restorer = tf.train.Saver() + + # Restore partial model + if os.path.exists(partial_model): + partial_restorer.restore(sess, partial_model) + + # Save 
trained vars + model_saver = tf.train.Saver() + all_vars_without_optim = tf.all_variables() + + # Attach optimization ops + word_vecs = tf.get_collection('variables','ans/word_vecs') + # vars_to_train = [var for var in ans_vars if + # 'ans/word_vecs' not in var.name] + vars_to_train = ans_vars + train_step = tf.train.AdamOptimizer(train_params['adam_lr']) \ + .minimize(total_loss, var_list = vars_to_train) + + # Initialize vars_to_init + all_vars = tf.all_variables() + optimizer_vars = [var for var in all_vars if var not in + all_vars_without_optim] + + print('Optimizer Variables: ') + print([var.name for var in optimizer_vars]) + print('------------------') + + if train_params['mode']=='q': + vars_to_init = ans_vars + optimizer_vars + else: + vars_to_init = optimizer_vars + + sess.run(tf.initialize_variables(vars_to_init)) + + # Load mean image + mean_image = np.load('/home/tanmay/Code/GenVQA/Exp_Results/' + \ + 'Obj_Classifier/mean_image.npy') + + # Start Training + max_epoch = train_params['max_epoch'] + max_iter = 4400*2 + val_acc_array_epoch = np.zeros([max_epoch]) + train_acc_array_epoch = np.zeros([max_epoch]) + + # Batch creators + train_batch_creator = ans_io_helper.batch_creator(1, max_iter*batch_size) + val_batch_creator = ans_io_helper.batch_creator(val_start_id, val_start_id + + val_set_size - 1) + val_small_batch_creator = ans_io_helper.batch_creator(val_start_id, + val_start_id + + val_set_size_small-1) + + # Check accuracy of restored model + # if train_params['fine_tune']==True: + # restored_accuracy = evaluate(accuracy, qa_anno_dict, + # region_anno_dict, ans_vocab, + # vocab, image_regions_dir, + # mean_image, val_start_id, + # val_set_size, batch_size, + # plholder_dict, 75, 75, + # val_batch_creator, + # parsed_q_dict) + # print('Accuracy of restored model: ' + str(restored_accuracy)) + + # Accuracy filename + train_accuracy_txtfile = os.path.join(outdir,'train_accuracy_' + \ + train_params['mode'] + '.txt') + val_accuracy_txtfile = os.path.join(outdir,'val_accuracy_' + \ + train_params['mode'] + '.txt') + + for epoch in range(start_epoch, max_epoch): + train_batch_creator.shuffle_ids() + for i in range(max_iter): + train_region_images, train_ans_labels, train_parsed_q, \ + train_region_score, train_partition= train_batch_creator \ + .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, + ans_vocab, vocab, + image_regions_dir, mean_image, + 1+i*batch_size, batch_size, + parsed_q_dict, + 75, 75, 3) + + feed_dict_train = ans_io_helper \ + .AnsFeedDictCreator(train_region_images, + train_ans_labels, + train_parsed_q, + train_region_score, + 0.5, + plholder_dict, + vocab, + True).feed_dict + + _, current_train_batch_acc, y_avg_eval, loss_eval = \ + sess.run([train_step, accuracy, y_avg, total_loss], + feed_dict=feed_dict_train) + + # print(y_avg_eval[0,:]) + # print(train_ans_labels[0,:]) + +# rel_logits = g.get_operation_by_name('rel/fc2/vec_logits') +# print(rel_logits.outputs[0].eval(feed_dict_train)) +# print (pred_rel_score.eval(feed_dict_train)) + + assert (not np.any(np.isnan(y_avg_eval))), 'NaN predicted' + + train_acc_array_epoch[epoch] = train_acc_array_epoch[epoch] + \ + current_train_batch_acc + + if (i+1)%500==0: + val_accuracy = evaluate(accuracy, qa_anno_dict, + region_anno_dict, ans_vocab, vocab, + image_regions_dir, mean_image, + val_start_id, val_set_size_small, + batch_size, plholder_dict, 75, 75, + val_small_batch_creator, + parsed_q_dict) + + print('Iter: ' + str(i+1) + ' Val Sm Acc: ' + str(val_accuracy)) + + train_acc_array_epoch[epoch] = 
train_acc_array_epoch[epoch] / max_iter + val_acc_array_epoch[epoch] = evaluate(accuracy, qa_anno_dict, + region_anno_dict, ans_vocab, + vocab, image_regions_dir, + mean_image, val_start_id, + val_set_size, batch_size, + plholder_dict, 75, 75, + val_batch_creator, + parsed_q_dict) + + print('Val Acc: ' + str(val_acc_array_epoch[epoch]) + + ' Train Acc: ' + str(train_acc_array_epoch[epoch])) + + + if train_params['fine_tune']==True: + plot_path = os.path.join(outdir, 'acc_vs_epoch_' \ + + train_params['mode'] + '_fine_tuned.pdf') + else: + plot_path = os.path.join(outdir, 'acc_vs_epoch_' \ + + train_params['mode'] + '.pdf') + + plotter.write_accuracy_to_file(start_epoch, epoch, + train_acc_array_epoch, + train_params['fine_tune'], + train_accuracy_txtfile) + plotter.write_accuracy_to_file(start_epoch, epoch, + val_acc_array_epoch, + train_params['fine_tune'], + val_accuracy_txtfile) + plotter.plot_accuracies(xdata=np.arange(0, epoch + 1) + 1, + ydata_train=train_acc_array_epoch[0:epoch + 1], + ydata_val=val_acc_array_epoch[0:epoch + 1], + xlim=[1, max_epoch], ylim=[0, 1.0], + savePath=plot_path) + + save_path = model_saver.save(sess, + os.path.join(outdir, 'ans_classifier_' + \ + train_params['mode']), global_step=epoch) + + sess.close() + tf.reset_default_graph() + +if __name__=='__main__': + print 'Hello' diff --git a/classifiers/region_ranker/train_rel_classifier.py b/classifiers/region_ranker/train_rel_classifier.py index a7f593174f8a6772d0b62dcb41e952f356067643..6ae315b94099acbba0b66229ab17458d1aafe20e 100644 --- a/classifiers/region_ranker/train_rel_classifier.py +++ b/classifiers/region_ranker/train_rel_classifier.py @@ -67,6 +67,7 @@ def evaluate(region_score_pred, qa_anno_dict, region_anno_dict, parsed_q_dict, vocab).feed_dict region_score_pred_eval = region_score_pred.eval(feed_dict) + print region_score_pred_eval recall_at_k += batch_recall(region_score_pred_eval, region_scores, -1) diff --git a/classifiers/region_ranker/train_rel_classifier_simple.py b/classifiers/region_ranker/train_rel_classifier_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..6ae315b94099acbba0b66229ab17458d1aafe20e --- /dev/null +++ b/classifiers/region_ranker/train_rel_classifier_simple.py @@ -0,0 +1,342 @@ +import sys +import os +import json +import math +import matplotlib.pyplot as plt +import matplotlib.image as mpimg +import numpy as np +import pdb +import tensorflow as tf +import answer_classifier.ans_data_io_helper as ans_io_helper +import region_ranker.perfect_ranker as region_proposer +import tf_graph_creation_helper as graph_creator +import plot_helper as plotter + +val_start_id = 89645 +val_set_size = 5000 +val_set_size_small = 500 + +def recall(pred_scores, gt_scores, k): + inc_order = np.argsort(pred_scores, 0) + dec_order = inc_order[::-1] + gt_scores_ordered = gt_scores[dec_order] + rel_reg_recalled = np.sum(gt_scores_ordered[0:k]!=0) + rel_reg = np.sum(gt_scores!=0) + return rel_reg_recalled/(rel_reg+0.00001) + + +def batch_recall(pred_scores, gt_scores, k): + batch_size = pred_scores.shape[0] + batch_recall = 0.0 + for i in xrange(batch_size): + if k==-1: + k_ = np.sum(gt_scores[i,:]!=0) + else: + k_ = k + batch_recall += recall(pred_scores[i,:], gt_scores[i,:], k_) + + batch_recall = batch_recall/batch_size + + return batch_recall + +def evaluate(region_score_pred, qa_anno_dict, region_anno_dict, parsed_q_dict, + ans_vocab, vocab, image_dir, mean_image, start_index, val_set_size, + batch_size, plholder_dict, img_height, img_width, batch_creator, + 
verbose=False): + + recall_at_k = 0 + max_iter = int(math.floor(val_set_size/batch_size)) + for i in xrange(max_iter): + if verbose==True: + print('Iter: ' + str(i+1) + '/' + str(max_iter)) + region_images, ans_labels, parsed_q, \ + region_scores_vec, partition= batch_creator \ + .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, + ans_vocab, vocab, image_dir, mean_image, + start_index+i*batch_size, batch_size, + parsed_q_dict, + img_height, img_width, 3) + region_scores = batch_creator.reshape_score(region_scores_vec) + + feed_dict = ans_io_helper \ + .RelFeedDictCreator(region_images, + parsed_q, + region_scores, + 1.0, + plholder_dict, + vocab).feed_dict + + region_score_pred_eval = region_score_pred.eval(feed_dict) + print region_score_pred_eval + + recall_at_k += batch_recall(region_score_pred_eval, + region_scores, -1) + + recall_at_k /= max_iter + + return recall_at_k + + +def train(train_params): + sess = tf.InteractiveSession() + train_anno_filename = train_params['train_json'] + test_anno_filename = train_params['test_json'] + parsed_q_filename = train_params['parsed_q_json'] + regions_anno_filename = train_params['regions_json'] + image_dir = train_params['image_dir'] + image_regions_dir = train_params['image_regions_dir'] + outdir = train_params['outdir'] + batch_size = train_params['batch_size'] + obj_atr_model = train_params['obj_atr_model'] + mode = train_params['mode'] + + if not os.path.exists(outdir): + os.mkdir(outdir) + + qa_anno_dict = ans_io_helper.parse_qa_anno(train_anno_filename) + parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename) + region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) + ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() + vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict) + + # Save region crops + if train_params['crop_n_save_regions'] == True: + qa_anno_dict_test = ans_io_helper.parse_qa_anno(test_anno_filename) + ans_io_helper.save_regions(image_dir, image_regions_dir, + qa_anno_dict, region_anno_dict, + 1, 94644, 75, 75) + ans_io_helper.save_regions(image_dir, image_regions_dir, + qa_anno_dict_test, region_anno_dict, + 94645, 143495-94645+1, 75, 75) + + + # Create graph + g = tf.get_default_graph() + plholder_dict = \ + graph_creator.placeholder_inputs_rel(ans_io_helper.num_proposals, + len(vocab), mode='gt') + image_regions = plholder_dict['image_regions'] + y = plholder_dict['gt_scores'] + keep_prob = plholder_dict['keep_prob'] + + y_pred_obj = graph_creator.obj_comp_graph(image_regions, 1.0) + obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') + obj_feat = obj_feat_op.outputs[0] + y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) + atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') + atr_feat = atr_feat_op.outputs[0] + y_pred = graph_creator.rel_comp_graph(plholder_dict, + obj_feat, atr_feat, + y_pred_obj, y_pred_atr, mode, + keep_prob, len(vocab), batch_size) + + accuracy = graph_creator.evaluation(y, y_pred) + + cross_entropy = graph_creator.loss(y, y_pred) + + # Collect variables + params_varnames = [ + 'rel/word_embed/word_vecs', + 'rel/conv1/W', + 'rel/conv2/W', + 'rel/conv1/b', + 'rel/conv2/b', + 'rel/fc1/W_reg', + 'rel/fc1/W_q', + 'rel/fc1/W_obj', + 'rel/fc1/W_atr', + 'rel/fc1/W_explt', + 'rel/fc1/b', + 'rel/fc2/W', + 'rel/fc2/b', + ] + + vars_dict = graph_creator.get_list_of_variables(params_varnames) + + # parameters grouped together + rel_word_params = [ + vars_dict['rel/word_embed/word_vecs'], + ] + + rel_conv_params = [ + 
vars_dict['rel/conv1/W'], + vars_dict['rel/conv2/W'], + ] + + rel_fc_params = [ + vars_dict['rel/fc1/W_reg'], + vars_dict['rel/fc1/W_q'], + vars_dict['rel/fc1/W_obj'], + vars_dict['rel/fc1/W_atr'], + vars_dict['rel/fc1/W_explt'], + vars_dict['rel/fc2/W'], + ] + + # Regularization + regularizer_rel_word_vecs = graph_creator.regularize_params(rel_word_params) + regularizer_rel_filters = graph_creator.regularize_params(rel_conv_params) + regularizer_rel_fcs = graph_creator.regularize_params(rel_fc_params) + + total_loss = cross_entropy + \ + 1e-4 * regularizer_rel_word_vecs + \ + 1e-3 * regularizer_rel_filters + \ + 1e-4 * regularizer_rel_fcs + + # Restore weights + obj_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='obj') + atr_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='atr') + rel_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='rel') + + vars_to_save = rel_vars + atr_vars + obj_vars + vars_to_train = rel_vars[:] + pretrained_vars = atr_vars + obj_vars + + # Model to save and restore weights from + model_saver = tf.train.Saver(vars_to_save) + + if train_params['fine_tune']==True: + pretrained_model = os.path.join(outdir, 'rel_classifier_' + mode +'-'+ \ + str(train_params['start_model'])) + assert (os.path.exists(pretrained_model)), \ + 'Pretrained model does not exist' + model_saver.restore(sess, pretrained_model) + pretrained_vars = vars_to_save[:] + start_epoch = train_params['start_model'] + 1 + else: + assert (os.path.exists(obj_atr_model)), \ + 'Obj_Atr model does not exist' + obj_atr_restorer = tf.train.Saver(pretrained_vars) + obj_atr_restorer.restore(sess, obj_atr_model) + start_epoch = 0 + + # Attach optimization ops + train_step = tf.train.AdamOptimizer(train_params['adam_lr']) \ + .minimize(total_loss, var_list=vars_to_train) + + # Initialize uninitialized vars + all_vars = tf.get_collection(tf.GraphKeys.VARIABLES) + vars_to_init = [var for var in all_vars if var not in pretrained_vars] + sess.run(tf.initialize_variables(vars_to_init)) + + print('-----------------') + print 'Variables to train:' + print [var.name for var in vars_to_train] + print('-----------------') + print 'Pretrained variables:' + print [var.name for var in pretrained_vars] + print('-----------------') + print 'Variables to initialize:' + print [var.name for var in vars_to_init] + print('-----------------') + print 'Variables to save' + print [var.name for var in vars_to_save] + print('-----------------') + + # Load mean image + mean_image = np.load('/home/tanmay/Code/GenVQA/Exp_Results/' + \ + 'Obj_Classifier/mean_image.npy') + + # Start Training + max_epoch = train_params['max_epoch'] + max_iter = 4400*2 + val_rec_array_epoch = np.zeros([max_epoch]) + train_rec_array_epoch = np.zeros([max_epoch]) + + # Batch creators + train_batch_creator = ans_io_helper.batch_creator(1, max_iter*batch_size) + val_batch_creator = ans_io_helper.batch_creator(val_start_id, val_start_id + + val_set_size - 1) + val_small_batch_creator = ans_io_helper.batch_creator(val_start_id, + val_start_id + + val_set_size_small-1) + + # Check accuracy of restored model + if train_params['fine_tune']==True: + restored_recall = evaluate(y_pred, qa_anno_dict, region_anno_dict, + parsed_q_dict, ans_vocab, + vocab, image_regions_dir, + mean_image, val_start_id, + val_set_size, batch_size, + plholder_dict, 75, 75, + val_batch_creator) + print('Recall of restored model: ' + str(restored_recall)) + + # Accuracy filename + train_recall_txtfile = os.path.join(outdir,'train_recall_'+ mode +'.txt') + val_recall_txtfile = 
os.path.join(outdir,'val_recall_'+ mode +'.txt') + + for epoch in range(start_epoch, max_epoch): + train_batch_creator.shuffle_ids() + for i in range(max_iter): + + train_region_images, train_ans_labels, train_parsed_q, \ + train_region_score_vec, train_partition= train_batch_creator \ + .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, + ans_vocab, vocab, + image_regions_dir, mean_image, + 1+i*batch_size, batch_size, + parsed_q_dict, + 75, 75, 3) + + train_region_score = train_batch_creator \ + .reshape_score(train_region_score_vec) + + feed_dict_train = ans_io_helper \ + .RelFeedDictCreator(train_region_images, + train_parsed_q, + train_region_score, + 0.5, + plholder_dict, + vocab).feed_dict + + _, current_train_batch_acc, y_pred_eval, loss_eval = \ + sess.run([train_step, accuracy, y_pred, total_loss], + feed_dict=feed_dict_train) + + assert (not np.any(np.isnan(y_pred_eval))), 'NaN predicted' + + train_rec_array_epoch[epoch] = train_rec_array_epoch[epoch] + \ + batch_recall(y_pred_eval, + train_region_score, -1) + + if (i+1)%500==0: + val_recall = evaluate(y_pred, qa_anno_dict, region_anno_dict, + parsed_q_dict, ans_vocab, vocab, + image_regions_dir, mean_image, + val_start_id, val_set_size_small, + batch_size, plholder_dict, 75, 75, + val_small_batch_creator) + + print('Iter: ' + str(i+1) + ' Val Sm Rec: ' + str(val_recall)) + + train_rec_array_epoch[epoch] = train_rec_array_epoch[epoch] / max_iter + val_rec_array_epoch[epoch] = evaluate(y_pred, qa_anno_dict, + region_anno_dict, parsed_q_dict, + ans_vocab, vocab, + image_regions_dir, mean_image, + val_start_id, val_set_size, + batch_size, plholder_dict, + 75, 75, val_batch_creator) + + print('Val Rec: ' + str(val_rec_array_epoch[epoch]) + + ' Train Rec: ' + str(train_rec_array_epoch[epoch])) + + + plotter.write_accuracy_to_file(start_epoch, epoch, + train_rec_array_epoch, + train_params['fine_tune'], + train_recall_txtfile) + plotter.write_accuracy_to_file(start_epoch, epoch, + val_rec_array_epoch, + train_params['fine_tune'], + val_recall_txtfile) + + save_path = model_saver.save(sess, + os.path.join(outdir, 'rel_classifier_' + \ + mode), + global_step=epoch) + + sess.close() + tf.reset_default_graph() + + diff --git a/classifiers/rel_graph_creator.py b/classifiers/rel_graph_creator.py new file mode 100644 index 0000000000000000000000000000000000000000..91fe8e5aa60c5087191be0e82b61b8db99ef2d20 --- /dev/null +++ b/classifiers/rel_graph_creator.py @@ -0,0 +1,172 @@ +import numpy as np +import math +import pdb +import tensorflow as tf +import answer_classifier.ans_data_io_helper as ans_io_helper +from tf_graph_creation_helper import weight_variable, bias_variable, \ + q_bin_embed_graph, conv2d, max_pool_2x2, batchnorm + + +class ans_graph_creator(): + def __init__(self, + plholder_dict, + obj_feat, + atr_feat, + obj_prob, + atr_prob, + vocab, + inv_vocab, + ans_vocab, + batch_size, + graph_config, + mode='q_obj_atr', + is_train=True): + + self.mode = mode + self.is_train = plholder_dict['is_train'] + self.keep_prob = plholder_dict['keep_prob'] + image_regions = plholder_dict['image_regions'] + vocab_size = len(vocab) + + with tf.name_scope('ans') as ans_graph: + # Word Vectors + word_vecs = self.create_word_vecs(vocab_size, + graph_config['word_vec_dim']) + + # Feature Computations + q_feat = self.add_q_feat_comp_layer(word_vecs, plholder_dict) + reg_feat = self.add_reg_feat_comp_layer(image_regions) + + # Feature Projections (with batch norm) + feat_proj_dim = graph_config['joint_embed_dim'] + proj_feat = dict() + + 
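+            # Each feature type below is projected into the same
+            # joint_embed_dim space so that the weighted sum taken with
+            # the mixing coefficients further down is well defined.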
+            proj_feat['q'] = self.fc_layer(q_feat, feat_proj_dim,
+                                           'q_feat_proj_layer')
+
+            proj_feat['reg'] = self.fc_layer(reg_feat, feat_proj_dim,
+                                             'reg_feat_proj_layer')
+
+            proj_feat['obj'] = self.fc_layer(obj_feat, feat_proj_dim,
+                                             'obj_feat_proj_layer')
+
+            proj_feat['atr'] = self.fc_layer(atr_feat, feat_proj_dim,
+                                             'atr_feat_proj_layer')
+
+            # Feature Combination
+            coeffs = self.mixing_coeffs()
+            print coeffs
+            num_regions = batch_size*ans_io_helper.num_proposals
+            comb_feat = tf.zeros(shape=[num_regions, feat_proj_dim],
+                                 dtype=tf.float32)
+            for feat_type, feat in proj_feat.items():
+                comb_feat = comb_feat + feat * coeffs[feat_type]
+
+            # Answer feature
+            ans_feat = self.compute_ans_feat(word_vecs, vocab, ans_vocab)
+
+            # Proj answer
+            proj_ans_feat = self.fc_layer(ans_feat, feat_proj_dim,
+                                          'ans_feat_proj_layer')
+
+            # Compute Cosine Distance
+            self.cosine_dist = self.compute_cosine_dist(comb_feat,
+                                                        proj_ans_feat)
+
+    def create_word_vecs(self, vocab_size, word_vec_dim):
+        word_vecs = weight_variable([vocab_size,
+                                     word_vec_dim],
+                                    var_name='word_vecs')
+        word_vecs = tf.nn.l2_normalize(word_vecs, 1)
+        tf.add_to_collection('regularize', word_vecs)
+        return word_vecs
+
+    def add_q_feat_comp_layer(self, word_vecs, plholder_dict):
+        with tf.name_scope('q_feat_comp_layer') as q_feat_comp_layer:
+            bin0_embed = q_bin_embed_graph('bin0', word_vecs, plholder_dict)
+            bin1_embed = q_bin_embed_graph('bin1', word_vecs, plholder_dict)
+            bin2_embed = q_bin_embed_graph('bin2', word_vecs, plholder_dict)
+            bin3_embed = q_bin_embed_graph('bin3', word_vecs, plholder_dict)
+            q_feat = tf.concat(1,
+                               [bin0_embed, bin1_embed, bin2_embed, bin3_embed],
+                               name='q_feat')
+        return q_feat
+
+    def add_reg_feat_comp_layer(self, image_regions):
+        with tf.name_scope('reg_feat_comp_layer') as reg_feat_comp_layer:
+            with tf.name_scope('conv1') as conv1:
+                W_conv1 = weight_variable([5,5,3,4])
+                b_conv1 = bias_variable([4])
+                a_conv1 = tf.add(conv2d(image_regions, W_conv1),
+                                 b_conv1, name='a')
+                h_conv1 = tf.nn.relu(a_conv1, name='h')
+                h_pool1 = max_pool_2x2(h_conv1)
+                h_conv1_drop = tf.nn.dropout(h_pool1, self.keep_prob,
+                                             name='h_pool_drop')
+
+            with tf.name_scope('conv2') as conv2:
+                W_conv2 = weight_variable([3,3,4,8])
+                b_conv2 = bias_variable([8])
+                a_conv2 = tf.add(conv2d(h_conv1_drop, W_conv2), b_conv2,
+                                 name='a')
+                h_conv2 = tf.nn.relu(a_conv2, name='h')
+                h_pool2 = max_pool_2x2(h_conv2)
+                h_pool2_drop = tf.nn.dropout(h_pool2, self.keep_prob,
+                                             name='h_pool_drop')
+                h_pool2_drop_shape = h_pool2_drop.get_shape()
+                reg_feat_dim = reduce(lambda f, g: f*g,
+                                      [dim.value for dim in
+                                       h_pool2_drop_shape[1:]])
+                reg_feat = tf.reshape(h_pool2_drop, [-1, reg_feat_dim],
+                                      name='reg_feat')
+
+            tf.add_to_collection('regularize', W_conv1)
+            tf.add_to_collection('regularize', W_conv2)
+
+        return reg_feat
+
+    def fc_layer(self, feat, proj_dim, name_scope):
+        with tf.name_scope(name_scope) as fc_layer:
+            feat_dim = feat.get_shape()[1].value
+            W1 = weight_variable([feat_dim, proj_dim])
+            b1 = bias_variable([proj_dim])
+            proj_feat = tf.add(tf.matmul(feat, W1), b1)
+            bn_proj_feat = batchnorm(proj_feat, None, self.is_train)
+            W2 = weight_variable([proj_dim, proj_dim])
+            b2 = bias_variable([proj_dim])
+            bn_proj_feat = tf.add(tf.matmul(tf.nn.relu(bn_proj_feat), W2), b2)
+            tf.add_to_collection('regularize', W1)
+            tf.add_to_collection('regularize', W2)
+
+        return bn_proj_feat
+
+    def mixing_coeffs(self):
+        feat_types = ['q', 'obj', 'atr', 'reg']
+        coeffs = dict()
+        count = 0
+        for feat_type in feat_types:
+            if feat_type in self.mode:
coeffs[feat_type] = 1.0 + count += 1 + else: + coeffs[feat_type] = 0.0 + coeffs = {k: v/count for k, v in coeffs.items()} + return coeffs + + def compute_ans_feat(self, word_vecs, vocab, ans_vocab): + ans_vocab_size = len(ans_vocab) + inv_ans_vocab = {v:k for k, v in ans_vocab.items()} + ans_in_vocab_ids_list = [] + for i in xrange(ans_vocab_size): + ans_in_vocab_ids_list.append(vocab[inv_ans_vocab[i]]) + + ans_in_vocab_ids_tensor = tf.constant(ans_in_vocab_ids_list, + dtype=tf.int64) + ans_feat = tf.nn.embedding_lookup(word_vecs, ans_in_vocab_ids_tensor, + name='ans_feat') + return ans_feat + + def compute_cosine_dist(self, feat1, feat2): + feat1 = tf.nn.l2_normalize(feat1, 1) + feat2 = tf.nn.l2_normalize(feat2, 1) + return tf.matmul(feat1, tf.transpose(feat2)) + diff --git a/classifiers/tf_graph_creation_helper.py b/classifiers/tf_graph_creation_helper.py index 0a8c93f49cf937ba4a71265f20c38c2ee30f2182..9b2d36b1c732340ac0349bbf60881edeea933c32 100644 --- a/classifiers/tf_graph_creation_helper.py +++ b/classifiers/tf_graph_creation_helper.py @@ -14,6 +14,7 @@ graph_config = { 'q_embed_dim': 200, 'ans_fc1_dim': 300, 'rel_fc1_dim': 100, + 'joint_embed_dim': 100, } def get_variable(var_scope): @@ -108,6 +109,8 @@ def placeholder_inputs_ans(total_vocab_size, ans_vocab_size, mode='gt'): 'questions'), 'region_score': tf.placeholder(tf.float32, [1,None], 'region_score'), + + 'is_train': tf.placeholder(tf.bool, [], 'is_train') } for i in xrange(4): bin_name = 'bin' + str(i) @@ -322,6 +325,7 @@ def rel_comp_graph(plholder_dict, obj_feat, atr_feat, a_atr_fc1 = tf.matmul(atr_feat, W_atr_fc1, name='a_atr_fc1') a_explt_fc1 = tf.matmul(concat_explt_feat, W_explt_fc1, name='a_explt_fc1') + coeff = { 'reg': 0.0, 'q': 0.0, @@ -486,7 +490,7 @@ def ans_comp_graph(plholder_dict, obj_feat, atr_feat, def ans_comp_margin_graph(plholder_dict, obj_feat, atr_feat, obj_prob, atr_prob, - vocab, inv_vocab, ans_vocab, mode): + vocab, inv_vocab, ans_vocab, mode, train): vocab_size = len(vocab) image_regions = plholder_dict['image_regions'] keep_prob = plholder_dict['keep_prob'] @@ -518,7 +522,7 @@ def ans_comp_margin_graph(plholder_dict, obj_feat, atr_feat, obj_prob, atr_prob, ans_embed = tf.nn.embedding_lookup(word_vecs, ans_in_vocab_ids_list, name='ans_embed') - + with tf.name_scope('explicit_feat') as expl_feat: explt_feat_list = [] for bin_num in xrange(4): @@ -582,6 +586,13 @@ def ans_comp_margin_graph(plholder_dict, obj_feat, atr_feat, obj_prob, atr_prob, a_fc1_q = tf.matmul(q_feat, W_q_fc1, name='a_fc1_q') a_explt_fc1 = tf.matmul(concat_explt_feat, W_explt_fc1, name='a_explt_fc1') + + a_fc1_region = batchnorm(a_fc1_region, 'reg', train) + a_fc1_obj = batchnorm(a_fc1_obj, 'obj', train) + a_fc1_atr = batchnorm(a_fc1_atr, 'atr', train) + a_fc1_q = batchnorm(a_fc1_q, 'q', train) + a_explt_fc1 = batchnorm(a_explt_fc1, 'explt', train) + coeff_reg = 0.0 coeff_obj = 0.0 coeff_atr = 0.0 @@ -611,7 +622,8 @@ def ans_comp_margin_graph(plholder_dict, obj_feat, atr_feat, obj_prob, atr_prob, a_fc1 = coeff_reg * a_fc1_region + \ coeff_obj * a_fc1_obj + \ coeff_atr * a_fc1_atr + \ - coeff_q * a_fc1_q + coeff_q * a_fc1_q + \ + coeff_explt * a_explt_fc1 h_fc1 = tf.nn.relu(a_fc1, name='h') h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob, name='h_drop') @@ -631,12 +643,17 @@ def ans_comp_margin_graph(plholder_dict, obj_feat, atr_feat, obj_prob, atr_prob, b_feat_fc2, name='comb_feat_embed') comb_ans_embed = tf.add(tf.matmul(ans_embed, W_ans_fc2), - b_ans_fc2, - name='comb_feat_embed') + b_ans_fc2, + name='comb_feat_embed') + 
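+    # Batch-normalize both embeddings, then L2-normalize them so the
+    # matmul in ans_scores computes cosine similarity between the
+    # combined feature embedding and every candidate answer embedding.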
comb_feat_embed = batchnorm(comb_feat_embed, 'feat_embed', train) + comb_ans_embed = batchnorm(comb_ans_embed, 'ans_embed', train) + comb_feat_embed = tf.nn.l2_normalize(comb_feat_embed, 1) + comb_ans_embed = tf.nn.l2_normalize(comb_ans_embed,1) ans_scores = tf.matmul(comb_feat_embed, tf.transpose(comb_ans_embed), name='ans_scores') - ans_scores = tf.nn.l2_normalize(ans_scores, 1)*3.0 + #ans_scores = tf.nn.l2_normalize(ans_scores, 1)*3.0 return tf.nn.softmax(ans_scores) + def aggregate_y_pred(y_pred, region_score, batch_size, num_proposals, @@ -669,7 +686,7 @@ def loss(y, y_pred): def margin_loss(y, y_pred, margin): correct_score = tf.reduce_sum(tf.mul(y, y_pred), 1, keep_dims=True, name='correct_score') - return tf.reduce_mean(tf.maximum(0.0, y + margin - correct_score)) + return tf.reduce_mean(tf.maximum(0.0, y_pred + margin - correct_score)) def regularize_params(param_list): @@ -679,6 +696,40 @@ def regularize_params(param_list): return regularizer +def batchnorm(input, suffix, is_train, decay=0.95, epsilon=1e-4, name='bn'): + rank = len(input.get_shape().as_list()) + in_dim = input.get_shape().as_list()[-1] + + if rank == 2: + axes = [0] + elif rank == 4: + axes = [0, 1, 2] + else: + raise ValueError('Input tensor must have rank 2 or 4.') + + if suffix: + suffix = '_' + suffix + else: + suffix = '' + + mean, variance = tf.nn.moments(input, axes) + offset = tf.Variable(initial_value=tf.constant(value=0.0, shape=[in_dim]), + name='offset' + suffix) + scale = tf.Variable(initial_value=tf.constant(value=1.0, shape=[in_dim]), + name='scale' + suffix) + + ema = tf.train.ExponentialMovingAverage(decay=decay) + ema_apply_op = ema.apply([mean, variance]) + ema_mean, ema_var = ema.average(mean), ema.average(variance) + + with tf.control_dependencies([ema_apply_op]): + bn_train = tf.nn.batch_normalization(input, mean, variance, + offset, scale, epsilon, name) + bn_test = tf.nn.batch_normalization(input, ema_mean, ema_var, + offset, scale, epsilon, name) + return tf.cond(is_train, lambda : bn_train, lambda : bn_test) + + if __name__ == '__main__': lg_dir = '/home/tanmay/Code/GenVQA/Exp_Results/lg_files/' diff --git a/classifiers/train_classifiers.py b/classifiers/train_classifiers.py index b8972ded3d2aada441c8e0f3a96ea759a0e1835f..46c1acd00521f8ab9bc6ed0e8ec6585a5d033cdc 100644 --- a/classifiers/train_classifiers.py +++ b/classifiers/train_classifiers.py @@ -1,3 +1,4 @@ + import sys import json import os @@ -9,7 +10,8 @@ import object_classifiers.train_obj_classifier as obj_trainer import object_classifiers.eval_obj_classifier as obj_evaluator import attribute_classifiers.train_atr_classifier as atr_trainer import attribute_classifiers.eval_atr_classifier as atr_evaluator -import answer_classifier.train_ans_classifier as ans_trainer +#import answer_classifier.train_ans_classifier as ans_trainer +import answer_classifier.train_ans_classifier_simple as ans_trainer import region_ranker.train_rel_classifier as rel_trainer import region_ranker.eval_rel_classifier as rel_evaluator @@ -108,10 +110,10 @@ ans_classifier_train_params = { 'adam_lr' : 0.0001, 'mode' : 'q_obj_atr', 'crop_n_save_regions': False, - 'max_epoch': 10, + 'max_epoch': 5, 'batch_size': 10, 'fine_tune': True, - 'start_model': 4, # When fine_tune is false used to pre-initialize q_obj_atr with q model etc + 'start_model': 1, # When fine_tune is false used to pre-initialize q_obj_atr with q model etc } if __name__=='__main__':
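
A minimal usage sketch (not part of the patch) of the new batchnorm helper added to tf_graph_creation_helper.py, under the TF 0.x API used throughout this diff. The 'is_train' placeholder mirrors the one added to placeholder_inputs_ans; x, 'demo', batch, and sess are illustrative names only:

    import numpy as np
    import tensorflow as tf
    from tf_graph_creation_helper import batchnorm

    x = tf.placeholder(tf.float32, [None, 100], 'x')   # rank-2 input
    is_train = tf.placeholder(tf.bool, [], 'is_train')
    bn_x = batchnorm(x, 'demo', is_train)

    sess = tf.InteractiveSession()
    sess.run(tf.initialize_all_variables())
    batch = np.random.rand(10, 100).astype(np.float32)
    # is_train=True selects the current batch moments (what AnsFeedDictCreator
    # feeds during training); is_train=False selects the exponential moving
    # averages maintained by ema.apply (what the eval scripts feed).
    train_out = sess.run(bn_x, {x: batch, is_train: True})
    eval_out = sess.run(bn_x, {x: batch, is_train: False})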