diff --git a/classifiers/ans_graph_creator.py b/classifiers/ans_graph_creator.py new file mode 100644 index 0000000000000000000000000000000000000000..91fe8e5aa60c5087191be0e82b61b8db99ef2d20 --- /dev/null +++ b/classifiers/ans_graph_creator.py @@ -0,0 +1,172 @@ +import numpy as np +import math +import pdb +import tensorflow as tf +import answer_classifier.ans_data_io_helper as ans_io_helper +from tf_graph_creation_helper import weight_variable, bias_variable, \ + q_bin_embed_graph, conv2d, max_pool_2x2, batchnorm + + +class ans_graph_creator(): + def __init__(self, + plholder_dict, + obj_feat, + atr_feat, + obj_prob, + atr_prob, + vocab, + inv_vocab, + ans_vocab, + batch_size, + graph_config, + mode='q_obj_atr', + is_train=True): + + self.mode = mode + self.is_train = plholder_dict['is_train'] + self.keep_prob = plholder_dict['keep_prob'] + image_regions = plholder_dict['image_regions'] + vocab_size = len(vocab) + + with tf.name_scope('ans') as ans_graph: + # Word Vectors + word_vecs = self.create_word_vecs(vocab_size, + graph_config['word_vec_dim']) + + # Feature Computations + q_feat = self.add_q_feat_comp_layer(word_vecs, plholder_dict) + reg_feat = self.add_reg_feat_comp_layer(image_regions) + + # Feature Projections (with batch norm) + feat_proj_dim = graph_config['joint_embed_dim'] + proj_feat = dict() + + proj_feat['q'] = self.fc_layer(q_feat, feat_proj_dim, + 'q_feat_proj_layer') + + proj_feat['reg'] = self.fc_layer(reg_feat, feat_proj_dim, + 'reg_feat_proj_layer') + + proj_feat['obj'] = self.fc_layer(obj_feat, feat_proj_dim, + 'obj_feat_proj_layer') + + proj_feat['atr'] = self.fc_layer(atr_feat, feat_proj_dim, + 'atr_feat_proj_layer') + + # Feature Combination + coeffs = self.mixing_coeffs() + print coeffs + num_regions = batch_size*ans_io_helper.num_proposals + comb_feat = tf.zeros(shape=[num_regions, feat_proj_dim], + dtype=tf.float32) + for feat_type, feat in proj_feat.items(): + comb_feat = comb_feat + feat * coeffs[feat_type] + + # Answer feature + ans_feat = self.compute_ans_feat(word_vecs, vocab, ans_vocab) + + # Proj answer + proj_ans_feat = self.fc_layer(ans_feat, feat_proj_dim, + 'ans_feat_proj_layer') + + # Compute Cosine Distance + self.cosine_dist = self.compute_cosine_dist(comb_feat, + proj_ans_feat) + + def create_word_vecs(self, vocab_size, word_vec_dim): + word_vecs = weight_variable([vocab_size, + word_vec_dim], + var_name='word_vecs') + word_vecs = tf.nn.l2_normalize(word_vecs, 1) + tf.add_to_collection('regularize',word_vecs) + return word_vecs + + def add_q_feat_comp_layer(self, word_vecs, plholder_dict): + with tf.name_scope('q_feat_comp_layer') as q_feat_comp_layer: + bin0_embed = q_bin_embed_graph('bin0', word_vecs, plholder_dict) + bin1_embed = q_bin_embed_graph('bin1', word_vecs, plholder_dict) + bin2_embed = q_bin_embed_graph('bin2', word_vecs, plholder_dict) + bin3_embed = q_bin_embed_graph('bin3', word_vecs, plholder_dict) + q_feat = tf.concat(1, + [bin0_embed, bin1_embed, bin2_embed, bin3_embed], + name='q_feat') + return q_feat + + def add_reg_feat_comp_layer(self, image_regions): + with tf.name_scope('reg_feat_comp_layer') as reg_feat_comp_layer: + with tf.name_scope('conv1') as conv1: + W_conv1 = weight_variable([5,5,3,4]) + b_conv1 = bias_variable([4]) + a_conv1 = tf.add(conv2d(image_regions, W_conv1), + b_conv1, name='a') + h_conv1 = tf.nn.relu(a_conv1, name='h') + h_pool1 = max_pool_2x2(h_conv1) + h_conv1_drop = tf.nn.dropout(h_pool1, self.keep_prob, + name='h_pool_drop') + + with tf.name_scope('conv2') as conv2: + W_conv2 = 
weight_variable([3,3,4,8]) + b_conv2 = bias_variable([8]) + a_conv2 = tf.add(conv2d(h_pool1, W_conv2), b_conv2, name='a') + h_conv2 = tf.nn.relu(a_conv2, name='h') + h_pool2 = max_pool_2x2(h_conv2) + h_pool2_drop = tf.nn.dropout(h_pool2, self.keep_prob, + name='h_pool_drop') + h_pool2_drop_shape = h_pool2_drop.get_shape() + reg_feat_dim = reduce(lambda f, g: f*g, + [dim.value for dim in + h_pool2_drop_shape[1:]]) + reg_feat = tf.reshape(h_pool2_drop, [-1, reg_feat_dim], + name='reg_feat') + + tf.add_to_collection('regularize', W_conv1) + tf.add_to_collection('regularize', W_conv2) + + return reg_feat + + def fc_layer(self, feat, proj_dim, name_scope): + with tf.name_scope(name_scope) as fc_layer: + feat_dim = feat.get_shape()[1].value + W1 = weight_variable([feat_dim, proj_dim]) + b1 = bias_variable([proj_dim]) + proj_feat = tf.add(tf.matmul(feat, W1), b1) + bn_proj_feat = batchnorm(proj_feat, None, self.is_train) + W2 = weight_variable([proj_dim, proj_dim]) + b2 = bias_variable([proj_dim]) + bn_proj_feat = tf.add(tf.matmul(tf.nn.relu(bn_proj_feat), W2), b2) + tf.add_to_collection('regularize', W1) + tf.add_to_collection('regularize', W2) + + return bn_proj_feat + + def mixing_coeffs(self): + feat_types = ['q', 'obj', 'atr', 'reg'] + coeffs = dict() + count = 0; + for feat_type in feat_types: + if feat_type in self.mode: + coeffs[feat_type] = 1.0 + count += 1 + else: + coeffs[feat_type] = 0.0 + coeffs = {k: v/count for k, v in coeffs.items()} + return coeffs + + def compute_ans_feat(self, word_vecs, vocab, ans_vocab): + ans_vocab_size = len(ans_vocab) + inv_ans_vocab = {v:k for k, v in ans_vocab.items()} + ans_in_vocab_ids_list = [] + for i in xrange(ans_vocab_size): + ans_in_vocab_ids_list.append(vocab[inv_ans_vocab[i]]) + + ans_in_vocab_ids_tensor = tf.constant(ans_in_vocab_ids_list, + dtype=tf.int64) + ans_feat = tf.nn.embedding_lookup(word_vecs, ans_in_vocab_ids_tensor, + name='ans_feat') + return ans_feat + + def compute_cosine_dist(self, feat1, feat2): + feat1 = tf.nn.l2_normalize(feat1, 1) + feat2 = tf.nn.l2_normalize(feat2, 1) + return tf.matmul(feat1, tf.transpose(feat2)) + diff --git a/classifiers/answer_classifier/ans_data_io_helper.py b/classifiers/answer_classifier/ans_data_io_helper.py index e6ab0c31315c210f9318b94c8121ed9c794b069f..959e70ed8c68f3430e9c16c7acf9fda58a5664d5 100644 --- a/classifiers/answer_classifier/ans_data_io_helper.py +++ b/classifiers/answer_classifier/ans_data_io_helper.py @@ -51,6 +51,17 @@ def parse_qa_anno(json_filename): return qa_dict +def read_parsed_questions(json_filename): + with open(json_filename, 'r') as json_file: + raw_data = json.load(json_file) + + parsed_q_dict = dict() + for entry in raw_data: + parsed_q_dict[entry['question_id']] = entry['question_parse'] + + return parsed_q_dict + + def get_vocab(qa_dict): vocab = dict() count = 0; @@ -70,6 +81,17 @@ def get_vocab(qa_dict): return vocab, inv_vocab +def join_vocab(vocab, ans_vocab): + joint_vocab = vocab.copy() + count = len(joint_vocab) + for word in ans_vocab.keys(): + if word not in joint_vocab: + joint_vocab[word] = count + count += 1 + + return joint_vocab + + def save_regions(image_dir, out_dir, qa_dict, region_anno_dict, start_id, batch_size, img_width, img_height): @@ -80,7 +102,7 @@ def save_regions(image_dir, out_dir, qa_dict, region_anno_dict, start_id, region_shape = np.array([img_height/3, img_width/3], np.int32) image_done = dict() - for i in xrange(batch_size): + for i in xrange(start_id, start_id + batch_size): image_id = qa_dict[i].image_id image_done[image_id] = False 
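# NOTE (illustrative sketch, not part of this commit): in the new
# ans_graph_creator.py above, mixing_coeffs() simply averages whichever
# projected features are named in `mode`. For mode='q_obj_atr' it returns
#   coeffs = {'q': 1/3.0, 'obj': 1/3.0, 'atr': 1/3.0, 'reg': 0.0}
# so the combination loop reduces to an average of the active features:
#   comb_feat = sum(proj_feat[t] * coeffs[t] for t in proj_feat)
# comb_feat has shape [batch_size*num_proposals, joint_embed_dim] and the
# projected answer embeddings have shape [num_answers, joint_embed_dim];
# both are L2-normalized inside compute_cosine_dist, so its matmul yields a
# [batch_size*num_proposals, num_answers] matrix of cosine similarities.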
@@ -124,7 +146,8 @@ class batch_creator(): def ans_mini_batch_loader(self, qa_dict, region_anno_dict, ans_dict, vocab, image_dir, mean_image, start_index, batch_size, - img_height=100, img_width=100, channels = 3): + parsed_q_dict, img_height=100, img_width=100, + channels = 3): q_ids = self.qa_index(start_index, batch_size) @@ -141,8 +164,8 @@ class batch_creator(): region_shape[1], channels]) region_score = np.zeros(shape=[1,count]) partition = np.zeros(shape=[count]) - question_encodings = np.zeros(shape=[count, len(vocab)]) - + parsed_q = dict() + for i in xrange(batch_size): q_id = q_ids[i] image_id = qa_dict[q_id].image_id @@ -155,18 +178,9 @@ class batch_creator(): gt_regions_for_image, False) - question_encoding_tmp = np.zeros(shape=[1, len(vocab)]) - for word in question[0:-1].split(): - if word.lower() not in vocab: - word = 'unk' - question_encoding_tmp[0, vocab[word.lower()]] += 1 - - question_len = np.sum(question_encoding_tmp) - assert (not question_len==0) - question_encoding_tmp /= question_len - for j in xrange(num_proposals): counter = j + i*num_proposals + parsed_q[counter] = parsed_q_dict[q_id] proposal = regions[j] resized_region = mpimg.imread(os.path.join(image_dir, '{}_{}.png'.format(image_id,j))) @@ -175,14 +189,14 @@ class batch_creator(): region_score[0,counter] = proposal.score partition[counter] = i - question_encodings[counter,:] = question_encoding_tmp - score_start_id = i*num_proposals region_score[0, score_start_id:score_start_id+num_proposals] /=\ - np.sum(region_score[0,score_start_id + np.sum(region_score[0,score_start_id : score_start_id+num_proposals]) - return region_images, ans_labels, question_encodings, \ + + return region_images, ans_labels, parsed_q, \ region_score, partition + def reshape_score(self, region_score): num_cols = num_proposals @@ -193,6 +207,102 @@ class batch_creator(): return np.reshape(region_score,[num_rows, num_cols],'C') + +obj_labels = { + 0: 'blank', + 1: 'square', + 2: 'triangle', + 3: 'circle', +} + + +atr_labels = { + 0: 'red', + 1: 'green', + 2: 'blue', + 3: 'blank', +} + + +class FeedDictCreator(): + def __init__(self, region_images, parsed_q, + keep_prob, plholder_dict, vocab): + self.plholder_dict = plholder_dict + self.parsed_q = parsed_q + self.vocab = vocab + self.max_words = 5 + self.feed_dict = { + plholder_dict['image_regions']: region_images, + plholder_dict['keep_prob']: keep_prob, + } + self.add_bin('bin0') + self.add_bin('bin1') + self.add_bin('bin2') + self.add_bin('bin3') + for i in xrange(4): + bin_name = 'bin' + str(i) + self.label_bin_containment(bin_name, obj_labels, 'obj') + self.label_bin_containment(bin_name, atr_labels, 'atr') + + def add_bin(self, bin_name): + num_q = len(self.parsed_q) + shape_list = [num_q, len(self.vocab)] + indices_list = [] + values_list = [] + for q_num in xrange(num_q): + item = self.parsed_q[q_num] + word_list = item[bin_name] + num_words = len(word_list) + assert_str = 'number of bin words exceeded limit' + assert (num_words <= self.max_words), assert_str + for word_num, word in enumerate(word_list): + if word=='': + word = 'unk' + indices_list.append((q_num, word_num)) + values_list.append(self.vocab[word.lower()]) + + # convert to numpy arrays + shape = np.asarray(shape_list) + indices = np.asarray(indices_list) + values = np.asarray(values_list) + self.feed_dict[self.plholder_dict[bin_name + '_indices']] = indices + self.feed_dict[self.plholder_dict[bin_name + '_values']] = values + self.feed_dict[self.plholder_dict[bin_name + '_shape']] = shape + + def 
label_bin_containment(self, bin_name, labels, label_type): + num_q = len(self.parsed_q) + num_labels = len(labels) + containment = np.zeros([num_q, num_labels], dtype='float32') + for q_num in xrange(num_q): + for i, label in labels.items(): + if label in [pq.lower() for pq in \ + self.parsed_q[q_num][bin_name]]: + containment[q_num,i] = 1 + + plholder = self.plholder_dict[bin_name + '_' + \ + label_type + '_' + 'cont'] + self.feed_dict[plholder] = containment + + +class RelFeedDictCreator(FeedDictCreator): + def __init__(self, region_images, parsed_q, + gt_region_scores, keep_prob, plholder_dict, vocab, is_train): + FeedDictCreator.__init__(self, region_images, parsed_q, + keep_prob, plholder_dict, vocab) + self.feed_dict[plholder_dict['gt_scores']] = gt_region_scores + self.feed_dict[plholder_dict['is_train']] = is_train + + +class AnsFeedDictCreator(FeedDictCreator): + def __init__(self, region_images, ans_labels, parsed_q, + region_scores, keep_prob, plholder_dict, vocab, is_train): + FeedDictCreator.__init__(self, region_images, parsed_q, + keep_prob, plholder_dict, vocab) + self.feed_dict[plholder_dict['gt_answer']] = ans_labels + self.feed_dict[plholder_dict['region_score']] = region_scores + self.feed_dict[plholder_dict['is_train']] = is_train + + class html_ans_table_writer(): def __init__(self, filename): self.filename = filename diff --git a/classifiers/answer_classifier/eval_ans_classifier.py b/classifiers/answer_classifier/eval_ans_classifier.py index f0c66054ff6177982c9f7c8fbb377d77f9636c63..248336b37cf775a1fb13c3b7e105ad45040ac713 100644 --- a/classifiers/answer_classifier/eval_ans_classifier.py +++ b/classifiers/answer_classifier/eval_ans_classifier.py @@ -15,9 +15,9 @@ import region_ranker.perfect_ranker as region_proposer import train_ans_classifier as ans_trainer from PIL import Image, ImageDraw -def get_pred(y, qa_anno_dict, region_anno_dict, ans_vocab, vocab, +def get_pred(y, qa_anno_dict, region_anno_dict, parsed_q_dict, ans_vocab, vocab, image_dir, mean_image, start_index, val_set_size, batch_size, - placeholders, img_height, img_width, batch_creator): + plholder_dict, img_height, img_width, batch_creator): inv_ans_vocab = {v: k for k, v in ans_vocab.items()} pred_list = [] @@ -30,14 +30,14 @@ def get_pred(y, qa_anno_dict, region_anno_dict, ans_vocab, vocab, print('Iter: ' + str(i+1) + '/' + str(max_iter)) - region_images, ans_labels, questions, \ + region_images, ans_labels, parsed_q, \ region_score, partition = batch_creator \ .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, ans_vocab, vocab, image_dir, mean_image, start_index+i*batch_size, - batch_size_tmp, + batch_size_tmp, parsed_q_dict, img_height, img_width, 3) if i==max_iter-1: @@ -48,8 +48,9 @@ def get_pred(y, qa_anno_dict, region_anno_dict, ans_vocab, vocab, residual_region_images = np.zeros(shape=[residual_regions, img_height/3, img_width/3, 3]) - residual_questions = np.zeros(shape=[residual_regions, - len(vocab)]) + # residual_questions = np.zeros(shape=[residual_regions, + # len(vocab)]) + residual_ans_labels = np.zeros(shape=[residual_batch_size, len(ans_vocab)]) residual_region_score = np.zeros(shape=[1, residual_regions]) @@ -57,19 +58,29 @@ def get_pred(y, qa_anno_dict, region_anno_dict, ans_vocab, vocab, region_images = np.concatenate((region_images, residual_region_images), axis=0) - questions = np.concatenate((questions, residual_questions), axis=0) +# questions = np.concatenate((questions, residual_questions), axis=0) + for k in xrange(batch_size_tmp*22, batch_size*22): + parsed_q[k] = { + 
'bin0': [''], + 'bin1': [''], + 'bin2': [''], + 'bin3': [''], + } + ans_labels = np.concatenate((ans_labels, residual_ans_labels), axis=0) region_score = np.concatenate((region_score, residual_region_score), axis=1) - feed_dict = { - placeholders[0] : region_images, - placeholders[1] : questions, - placeholders[2] : 1.0, - placeholders[3] : ans_labels, - placeholders[4] : region_score, - } + + feed_dict = ans_io_helper \ + .AnsFeedDictCreator(region_images, + ans_labels, + parsed_q, + region_score, + 1.0, + plholder_dict, + vocab).feed_dict ans_ids = np.argmax(y.eval(feed_dict), 1) for j in xrange(batch_size_tmp): @@ -78,13 +89,6 @@ def get_pred(y, qa_anno_dict, region_anno_dict, ans_vocab, vocab, 'answer' : inv_ans_vocab[ans_ids[j]] }] - # g = tf.get_default_graph() - # q_feat_op = g.get_operation_by_name('ans/word_embed/q_feat') - # q_feat = q_feat_op.outputs[0] - # region_feat_op = g.get_operation_by_name('ans/conv2/region_feat') - # region_feat = region_feat_op.outputs[0] - # pdb.set_trace() - return pred_list def eval(eval_params): @@ -92,6 +96,7 @@ def eval(eval_params): train_anno_filename = eval_params['train_json'] test_anno_filename = eval_params['test_json'] + parsed_q_filename = eval_params['parsed_q_json'] regions_anno_filename = eval_params['regions_json'] image_regions_dir = eval_params['image_regions_dir'] outdir = eval_params['outdir'] @@ -104,38 +109,47 @@ def eval(eval_params): qa_anno_dict_train = ans_io_helper.parse_qa_anno(train_anno_filename) qa_anno_dict = ans_io_helper.parse_qa_anno(test_anno_filename) + parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename) region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict_train) # Create graph g = tf.get_default_graph() - image_regions, questions, keep_prob, y, region_score= \ - graph_creator.placeholder_inputs_ans(len(vocab), len(ans_vocab), - mode='gt') - + plholder_dict = graph_creator.placeholder_inputs_ans(len(vocab), + len(ans_vocab), + mode='gt') + + image_regions = plholder_dict['image_regions'] + questions = plholder_dict['questions'] + keep_prob = plholder_dict['keep_prob'] + y = plholder_dict['gt_answer'] + region_score = plholder_dict['region_score'] + y_pred_obj = graph_creator.obj_comp_graph(image_regions, 1.0) obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') obj_feat = obj_feat_op.outputs[0] y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') atr_feat = atr_feat_op.outputs[0] - pred_rel_score = graph_creator.rel_comp_graph(image_regions, questions, + pred_rel_score = graph_creator.rel_comp_graph(plholder_dict, + obj_feat, atr_feat, y_pred_obj, y_pred_atr, 'q_obj_atr_reg', 1.0, len(vocab), batch_size) - y_pred = graph_creator.ans_comp_graph(image_regions, questions, keep_prob, - obj_feat, atr_feat, vocab, - inv_vocab, len(ans_vocab), - eval_params['mode']) + y_pred = graph_creator.ans_comp_margin_graph(plholder_dict, + obj_feat, atr_feat, + y_pred_obj, y_pred_atr, + vocab, inv_vocab, ans_vocab, + eval_params['mode']) pred_rel_score_vec = tf.reshape(pred_rel_score, [1, batch_size*ans_io_helper.num_proposals]) + y_avg = graph_creator.aggregate_y_pred(y_pred, pred_rel_score_vec, batch_size, ans_io_helper.num_proposals, len(ans_vocab)) - cross_entropy = graph_creator.loss(y, y_avg) accuracy = graph_creator.evaluation(y, y_avg) # Collect variables @@ -160,16 +174,15 @@ def 
eval(eval_params): mean_image = np.load('/home/tanmay/Code/GenVQA/Exp_Results/' + \ 'Obj_Classifier/mean_image.npy') - placeholders = [image_regions, questions, keep_prob, y, region_score] - # Batch creator test_batch_creator = ans_io_helper.batch_creator(test_start_id, test_start_id + test_set_size - 1) # Get predictions - pred_dict = get_pred(y_avg, qa_anno_dict, region_anno_dict, ans_vocab, + pred_dict = get_pred(y_avg, qa_anno_dict, region_anno_dict, + parsed_q_dict, ans_vocab, vocab, image_regions_dir, mean_image, test_start_id, - test_set_size, batch_size, placeholders, 75, 75, + test_set_size, batch_size, plholder_dict, 75, 75, test_batch_creator) json_filename = os.path.join(outdir, 'predicted_ans_' + \ @@ -271,11 +284,12 @@ if __name__=='__main__': 'train_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/train_anno.json', 'test_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/test_anno.json', 'regions_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/regions_anno.json', + 'parsed_q_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/parsed_questions.json', 'image_dir': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/images', 'image_regions_dir': '/mnt/ramdisk/image_regions', - 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_w_Rel', - 'rel_model': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Prob/rel_classifier_q_obj_atr_reg-4', - 'model': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_w_Rel/ans_classifier_' + mode + '-' + str(model_num), + 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_Margin', + 'rel_model': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Explt/rel_classifier_q_obj_atr_reg_explt-9', + 'model': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_Margin/ans_classifier_' + mode + '-' + str(model_num), 'mode' : mode, 'batch_size': 20, 'test_start_id': 94645, diff --git a/classifiers/answer_classifier/eval_ans_classifier_simple.py b/classifiers/answer_classifier/eval_ans_classifier_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..602d359f1cb6e504c2947c3bdb1fc62a5738f73b --- /dev/null +++ b/classifiers/answer_classifier/eval_ans_classifier_simple.py @@ -0,0 +1,330 @@ +import sys +import os +import json +import matplotlib.pyplot as plt +import matplotlib.image as mpimg +import numpy as np +import math +import random +import pdb +import tensorflow as tf +import tf_graph_creation_helper as graph_creator +import ans_graph_creator +import rel_graph_creator +import plot_helper as plotter +import ans_data_io_helper as ans_io_helper +import region_ranker.perfect_ranker as region_proposer +import train_ans_classifier as ans_trainer +from PIL import Image, ImageDraw + +def get_pred(y, qa_anno_dict, region_anno_dict, parsed_q_dict, ans_vocab, vocab, + image_dir, mean_image, start_index, val_set_size, batch_size, + plholder_dict, img_height, img_width, batch_creator): + + inv_ans_vocab = {v: k for k, v in ans_vocab.items()} + pred_list = [] + correct = 0 + max_iter = int(math.ceil(val_set_size*1.0/batch_size)) + batch_size_tmp = batch_size + for i in xrange(max_iter): + if i==(max_iter-1): + batch_size_tmp = val_set_size - i*batch_size + + print('Iter: ' + str(i+1) + '/' + str(max_iter)) + + region_images, ans_labels, parsed_q, \ + region_score, partition = batch_creator \ + .ans_mini_batch_loader(qa_anno_dict, + region_anno_dict, + ans_vocab, vocab, + image_dir, mean_image, + start_index+i*batch_size, + batch_size_tmp, parsed_q_dict, + img_height, img_width, 3) + + 
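        # The final iteration may contain fewer than batch_size questions; the
        # branch below pads region images, answer labels, and region scores
        # with zeros and fills the remaining parsed_q slots with empty bins so
        # the fixed-size placeholders can still be fed. Only the first
        # batch_size_tmp predictions are appended to pred_list afterwards.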
if i==max_iter-1: + + residual_batch_size = batch_size - batch_size_tmp + residual_regions = residual_batch_size*ans_io_helper.num_proposals + + residual_region_images = np.zeros(shape=[residual_regions, + img_height/3, img_width/3, + 3]) + # residual_questions = np.zeros(shape=[residual_regions, + # len(vocab)]) + + residual_ans_labels = np.zeros(shape=[residual_batch_size, + len(ans_vocab)]) + residual_region_score = np.zeros(shape=[1, residual_regions]) + + region_images = np.concatenate((region_images, + residual_region_images), + axis=0) +# questions = np.concatenate((questions, residual_questions), axis=0) + for k in xrange(batch_size_tmp*22, batch_size*22): + parsed_q[k] = { + 'bin0': [''], + 'bin1': [''], + 'bin2': [''], + 'bin3': [''], + } + + ans_labels = np.concatenate((ans_labels, residual_ans_labels), + axis=0) + region_score = np.concatenate((region_score, residual_region_score), + axis=1) + + + feed_dict = ans_io_helper \ + .AnsFeedDictCreator(region_images, + ans_labels, + parsed_q, + region_score, + 1.0, + plholder_dict, + vocab, + False).feed_dict + + ans_ids = np.argmax(y.eval(feed_dict), 1) + for j in xrange(batch_size_tmp): + pred_list = pred_list + [{ + 'question_id' : start_index+i*batch_size+j, + 'answer' : inv_ans_vocab[ans_ids[j]] + }] + + return pred_list + +def eval(eval_params): + sess = tf.InteractiveSession() + + train_anno_filename = eval_params['train_json'] + test_anno_filename = eval_params['test_json'] + parsed_q_filename = eval_params['parsed_q_json'] + regions_anno_filename = eval_params['regions_json'] + image_regions_dir = eval_params['image_regions_dir'] + outdir = eval_params['outdir'] + model = eval_params['model'] + batch_size = eval_params['batch_size'] + test_start_id = eval_params['test_start_id'] + test_set_size = eval_params['test_set_size'] + if not os.path.exists(outdir): + os.mkdir(outdir) + + qa_anno_dict_train = ans_io_helper.parse_qa_anno(train_anno_filename) + qa_anno_dict = ans_io_helper.parse_qa_anno(test_anno_filename) + parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename) + region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) + ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() + vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict_train) + + # Create graph + g = tf.get_default_graph() + plholder_dict = graph_creator.placeholder_inputs_ans(len(vocab), + len(ans_vocab), + mode='gt') + + image_regions = plholder_dict['image_regions'] + questions = plholder_dict['questions'] + keep_prob = plholder_dict['keep_prob'] + y = plholder_dict['gt_answer'] + region_score = plholder_dict['region_score'] + + y_pred_obj = graph_creator.obj_comp_graph(image_regions, 1.0) + obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') + obj_feat = obj_feat_op.outputs[0] + y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) + atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') + atr_feat = atr_feat_op.outputs[0] + # pred_rel_score = graph_creator.rel_comp_graph(plholder_dict, + # obj_feat, atr_feat, + # y_pred_obj, y_pred_atr, + # 'q_obj_atr_reg', + # 1.0, len(vocab), batch_size) + + rel_graph = rel_graph_creator.rel_graph_creator(plholder_dict, + 1.0, + obj_feat, + atr_feat, + y_pred_obj, + y_pred_atr, + len(vocab), + batch_size, + graph_creator.graph_config, + 'q_obj_atr_reg_explt', + False) + + pred_rel_score = rel_graph.rel_score + + ans_graph = ans_graph_creator.ans_graph_creator(plholder_dict, + obj_feat, + atr_feat, + y_pred_obj, + y_pred_atr, + vocab, + inv_vocab, 
+ ans_vocab, + batch_size, + graph_creator.graph_config, + eval_params['mode'], + True) + + y_pred = ans_graph.cosine_dist + + pred_rel_score_vec = tf.reshape(pred_rel_score, + [1, batch_size*ans_io_helper.num_proposals]) + + y_avg = graph_creator.aggregate_y_pred(y_pred, + region_score, + batch_size, + ans_io_helper.num_proposals, + len(ans_vocab)) + + accuracy = graph_creator.evaluation(y, y_avg) + + # Collect variables + rel_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='rel') + obj_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='obj') + atr_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='atr') + + # Restore model + restorer = tf.train.Saver() + if os.path.exists(model): + restorer.restore(sess, model) + else: + print 'Failed to read model from file ' + model + +# sess.run(tf.initialize_variables(vars_to_init)) + + mean_image = np.load('/home/tanmay/Code/GenVQA/Exp_Results/' + \ + 'Obj_Classifier/mean_image.npy') + + # Batch creator + test_batch_creator = ans_io_helper.batch_creator(test_start_id, + test_start_id + + test_set_size - 1) + # Get predictions + pred_dict = get_pred(y_avg, qa_anno_dict, region_anno_dict, + parsed_q_dict, ans_vocab, + vocab, image_regions_dir, mean_image, test_start_id, + test_set_size, batch_size, plholder_dict, 75, 75, + test_batch_creator) + + json_filename = os.path.join(outdir, 'predicted_ans_' + \ + eval_params['mode'] + '.json') + with open(json_filename,'w') as json_file: + json.dump(pred_dict, json_file) + + +def create_html_file(outdir, test_anno_filename, regions_anno_filename, + pred_json_filename, image_dir, num_pred_to_display, mode): + qa_dict = ans_io_helper.parse_qa_anno(test_anno_filename) + region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) + ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() + + with open(pred_json_filename,'r') as json_file: + raw_data = json.load(json_file) + + # Create director for storing images with region boxes + images_bbox_dir = os.path.join(outdir, 'images_bbox' + '_' + mode) + if not os.path.exists(images_bbox_dir): + os.mkdir(images_bbox_dir) + + col_dict = { + 0 : 'Question_Id', + 1 : 'Question', + 2 : 'Answer (GT)', + 3 : 'Answer (Pred)', + 4 : 'Image', + } + html_correct_filename = os.path.join(outdir, + 'correct_ans_' + mode + '.html') + html_writer_correct = ans_io_helper \ + .html_ans_table_writer(html_correct_filename) + html_writer_correct.add_element(col_dict) + + html_incorrect_filename = os.path.join(outdir, + 'incorrect_ans_' + mode + '.html') + html_writer_incorrect = ans_io_helper \ + .html_ans_table_writer(html_incorrect_filename) + html_writer_incorrect.add_element(col_dict) + + region_coords, region_coords_ = region_proposer.get_region_coords(300,300) + + random.shuffle(raw_data) + + count = 0 + for entry in raw_data: + if count == num_pred_to_display: + break + q_id = entry['question_id'] + pred_ans = entry['answer'] + gt_ans = qa_dict[q_id].answer + question = qa_dict[q_id].question + img_id = qa_dict[q_id].image_id + image_filename = os.path.join(image_dir, str(img_id) + '.jpg') + image = Image.open(image_filename) + + regions = region_proposer.rank_regions(image, question, region_coords, + region_coords_, + region_anno_dict[img_id], + crop=False) + dr = ImageDraw.Draw(image) + # print(q_id) + # print([regions[key].score for key in regions.keys()]) + for i in xrange(ans_io_helper.num_proposals): + if not regions[i].score==0: + coord = regions[i].coord + x1 = coord[0] + y1 = coord[1] + x2 = coord[2] + y2 = coord[3] + 
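                    # draw this proposal's bounding box (only proposals with a
                    # nonzero relevance score reach here) on the image copy
                    # that is saved for the HTML results table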
dr.rectangle([(x1,y1),(x2,y2)], outline="red") + + image_bbox_filename = os.path.join(images_bbox_dir,str(q_id) + '.jpg') + image.save(image_bbox_filename) + image_bbox_filename_rel = 'images_bbox_'+ mode +'/'+ str(q_id) + '.jpg' + col_dict = { + 0 : q_id, + 1 : question, + 2 : gt_ans, + 3 : pred_ans, + 4 : html_writer_correct.image_tag(image_bbox_filename_rel,50,50) + } + if pred_ans==gt_ans: + html_writer_correct.add_element(col_dict) + else: + html_writer_incorrect.add_element(col_dict) + + count += 1 + + html_writer_correct.close_file() + html_writer_incorrect.close_file() + + +if __name__=='__main__': + mode = 'q_obj_atr_reg' + model_num = 4 + ans_classifier_eval_params = { + 'train_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/train_anno.json', + 'test_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/test_anno.json', + 'regions_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/regions_anno.json', + 'parsed_q_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/parsed_questions.json', + 'image_dir': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/images', + 'image_regions_dir': '/mnt/ramdisk/image_regions', + 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_Margin', + 'rel_model': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Explt_At_End/rel_classifier_q_obj_atr_reg_explt-4', + 'model': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_Margin/ans_classifier_' + mode + '-' + str(model_num), + 'mode' : mode, + 'batch_size': 20, + 'test_start_id': 94645, + 'test_set_size': 143495-94645+1, + } + + eval(ans_classifier_eval_params) + outdir = ans_classifier_eval_params['outdir'] + test_anno_filename = ans_classifier_eval_params['test_json'] + regions_anno_filename = ans_classifier_eval_params['regions_json'] + pred_json_filename = os.path.join(outdir, 'predicted_ans_'+ mode +'.json') + image_dir = ans_classifier_eval_params['image_dir'] + create_html_file(outdir, test_anno_filename, regions_anno_filename, + pred_json_filename, image_dir, 1000, mode) diff --git a/classifiers/answer_classifier/train_ans_classifier.py b/classifiers/answer_classifier/train_ans_classifier.py index 6d4213eae4ab8f51de2daf44c26ff39c2292832b..4b4f1ca8a89da827f6ca65f02794bf0bcd831823 100644 --- a/classifiers/answer_classifier/train_ans_classifier.py +++ b/classifiers/answer_classifier/train_ans_classifier.py @@ -31,9 +31,12 @@ def get_process_flow_vars(mode, obj_vars, atr_vars, rel_vars, fine_tune): 'ans/fc1/W_obj', 'ans/fc1/W_atr', 'ans/fc1/W_q', + 'ans/fc1/W_explt', 'ans/fc1/b', - 'ans/fc2/W', - 'ans/fc2/b' + 'ans/fc2/W_feat', + 'ans/fc2/b_feat', + 'ans/fc2/W_ans', + 'ans/fc2/b_ans', ] vars_dict = graph_creator.get_list_of_variables(list_of_vars) @@ -43,8 +46,10 @@ def get_process_flow_vars(mode, obj_vars, atr_vars, rel_vars, fine_tune): vars_dict['ans/word_embed/word_vecs'], vars_dict['ans/fc1/W_q'], vars_dict['ans/fc1/b'], - vars_dict['ans/fc2/W'], - vars_dict['ans/fc2/b'], + vars_dict['ans/fc2/W_feat'], + vars_dict['ans/fc2/b_feat'], + vars_dict['ans/fc2/W_ans'], + vars_dict['ans/fc2/b_ans'], ] reg_ans_params = [ @@ -57,6 +62,7 @@ def get_process_flow_vars(mode, obj_vars, atr_vars, rel_vars, fine_tune): obj_ans_params = [ vars_dict['ans/fc1/W_obj'], + vars_dict['ans/fc1/W_explt'] ] atr_ans_params = [ @@ -88,9 +94,9 @@ def get_process_flow_vars(mode, obj_vars, atr_vars, rel_vars, fine_tune): elif mode=='q_obj_atr_reg': vars_to_train += reg_ans_params - if not mode=='q': - vars_to_train = [var for var in vars_to_train if \ - 'ans/word_embed/word_vecs' not in 
var.name] + # if not mode=='q': + # vars_to_train = [var for var in vars_to_train if \ + # 'ans/word_embed/word_vecs' not in var.name] # Fine tune begining with a previous model if fine_tune==True: @@ -133,25 +139,24 @@ def get_process_flow_vars(mode, obj_vars, atr_vars, rel_vars, fine_tune): def evaluate(accuracy, qa_anno_dict, region_anno_dict, ans_vocab, vocab, image_dir, mean_image, start_index, val_set_size, batch_size, - placeholders, img_height, img_width, batch_creator): + plholder_dict, img_height, img_width, batch_creator, + parsed_q_dict): correct = 0 max_iter = int(math.floor(val_set_size/batch_size)) for i in xrange(max_iter): - region_images, ans_labels, questions, \ + region_images, ans_labels, parsed_q, \ region_score, partition= batch_creator \ .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, ans_vocab, vocab, image_dir, mean_image, start_index+i*batch_size, batch_size, + parsed_q_dict, img_height, img_width, 3) - feed_dict = { - placeholders[0] : region_images, - placeholders[1] : questions, - placeholders[2] : 1.0, - placeholders[3] : ans_labels, - placeholders[4] : region_score, - } + feed_dict = ans_io_helper.\ + AnsFeedDictCreator(region_images, ans_labels, parsed_q, + region_score, 1.0, plholder_dict, + vocab).feed_dict correct = correct + accuracy.eval(feed_dict) @@ -163,6 +168,7 @@ def train(train_params): train_anno_filename = train_params['train_json'] test_anno_filename = train_params['test_json'] + parsed_q_filename = train_params['parsed_q_json'] regions_anno_filename = train_params['regions_json'] image_dir = train_params['image_dir'] image_regions_dir = train_params['image_regions_dir'] @@ -175,9 +181,11 @@ def train(train_params): os.mkdir(outdir) qa_anno_dict = ans_io_helper.parse_qa_anno(train_anno_filename) + parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename) region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict) +# vocab = ans_io_helper.join_vocab(vocab, ans_vocab) # Save region crops if train_params['crop_n_save_regions'] == True: @@ -192,23 +200,27 @@ def train(train_params): # Create graph g = tf.get_default_graph() - image_regions, questions, keep_prob, y, region_score= \ - graph_creator.placeholder_inputs_ans(len(vocab), len(ans_vocab), - mode='gt') + plholder_dict = graph_creator.placeholder_inputs_ans(len(vocab), + len(ans_vocab), + mode='gt') + image_regions = plholder_dict['image_regions'] + questions = plholder_dict['questions'] + keep_prob = plholder_dict['keep_prob'] + y = plholder_dict['gt_answer'] + region_score = plholder_dict['region_score'] + y_pred_obj = graph_creator.obj_comp_graph(image_regions, 1.0) obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') obj_feat = obj_feat_op.outputs[0] y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') atr_feat = atr_feat_op.outputs[0] - # pred_rel_score = graph_creator.rel_comp_graph(image_regions, questions, - # y_pred_obj, y_pred_atr, - # 'q_obj_atr_reg', 1.0, - # len(vocab), batch_size) - pred_rel_score = graph_creator.rel_comp_graph(image_regions, questions, - obj_feat, atr_feat, - 'q_obj_atr_reg', 1.0, - len(vocab), batch_size) + + pred_rel_score = graph_creator.rel_comp_graph(plholder_dict, + obj_feat, atr_feat, + y_pred_obj, y_pred_atr, + 'q_obj_atr_reg_explt', + 1.0, len(vocab), batch_size) # Restore rel, obj and attribute classifier parameters 
rel_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='rel') @@ -221,18 +233,23 @@ def train(train_params): rel_saver.restore(sess, rel_model) obj_atr_saver.restore(sess, obj_atr_model) - y_pred = graph_creator.ans_comp_graph(image_regions, questions, keep_prob, - obj_feat, atr_feat, vocab, - inv_vocab, len(ans_vocab), - train_params['mode']) + y_pred = graph_creator.ans_comp_margin_graph(plholder_dict, + obj_feat, atr_feat, + y_pred_obj, y_pred_atr, + vocab, inv_vocab, ans_vocab, + train_params['mode'], True) + + pdb.set_trace() pred_rel_score_vec = tf.reshape(pred_rel_score, [1, batch_size*ans_io_helper.num_proposals]) + y_avg = graph_creator.aggregate_y_pred(y_pred, pred_rel_score_vec, batch_size, ans_io_helper.num_proposals, len(ans_vocab)) cross_entropy = graph_creator.loss(y, y_avg) + #margin_loss = graph_creator.margin_loss(y, y_avg, 1) accuracy = graph_creator.evaluation(y, y_avg) # Collect variables @@ -256,9 +273,11 @@ def train(train_params): vars_dict['ans/fc1/W_obj'], vars_dict['ans/fc1/W_atr'], vars_dict['ans/fc1/W_q'], + vars_dict['ans/fc1/W_explt'], ] - ans_fc2_params = [vars_dict['ans/fc2/W']] + ans_fc2_params = [vars_dict['ans/fc2/W_feat'], + vars_dict['ans/fc2/W_ans']] regularizer_ans_word_vecs = graph_creator \ .regularize_params(ans_word_vec_params) @@ -268,6 +287,11 @@ def train(train_params): regularizer_ans_fcs = graph_creator \ .regularize_params(ans_fc1_params + ans_fc2_params) + # total_loss = margin_loss + \ + # 1e-5 * regularizer_ans_word_vecs + \ + # 1e-5 * regularizer_ans_fcs + \ + # 1e-3 * regularizer_ans_filters + total_loss = cross_entropy + \ 1e-5 * regularizer_ans_word_vecs + \ 1e-5 * regularizer_ans_fcs + \ @@ -295,7 +319,8 @@ def train(train_params): partial_restorer = tf.train.Saver(vars_to_restore) else: start_epoch = 0 - partial_restorer = tf.train.Saver(pretrained_vars) + if train_params['mode']!='q': + partial_restorer = tf.train.Saver(pretrained_vars) # Restore partial model # partial_restorer = tf.train.Saver(vars_to_restore) @@ -324,7 +349,7 @@ def train(train_params): mean_image = np.load('/home/tanmay/Code/GenVQA/Exp_Results/' + \ 'Obj_Classifier/mean_image.npy') - placeholders = [image_regions, questions, keep_prob, y, region_score] +# placeholders = [image_regions, questions, keep_prob, y, region_score] # Start Training max_epoch = train_params['max_epoch'] @@ -347,8 +372,9 @@ def train(train_params): vocab, image_regions_dir, mean_image, val_start_id, val_set_size, batch_size, - placeholders, 75, 75, - val_batch_creator) + plholder_dict, 75, 75, + val_batch_creator, + parsed_q_dict) print('Accuracy of restored model: ' + str(restored_accuracy)) # Accuracy filename @@ -360,23 +386,25 @@ def train(train_params): for epoch in range(start_epoch, max_epoch): train_batch_creator.shuffle_ids() for i in range(max_iter): - - train_region_images, train_ans_labels, train_questions, \ + train_region_images, train_ans_labels, train_parsed_q, \ train_region_score, train_partition= train_batch_creator \ .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, ans_vocab, vocab, image_regions_dir, mean_image, - 1+i*batch_size, batch_size, + 1+i*batch_size, batch_size, + parsed_q_dict, 75, 75, 3) - - feed_dict_train = { - image_regions : train_region_images, - questions: train_questions, - keep_prob: 0.5, - y: train_ans_labels, - region_score: train_region_score, - } + + feed_dict_train = ans_io_helper \ + .AnsFeedDictCreator(train_region_images, + train_ans_labels, + train_parsed_q, + train_region_score, + 0.5, + plholder_dict, + vocab).feed_dict + _, 
current_train_batch_acc, y_avg_eval, loss_eval = \ sess.run([train_step, accuracy, y_avg, total_loss], feed_dict=feed_dict_train) @@ -394,8 +422,9 @@ def train(train_params): region_anno_dict, ans_vocab, vocab, image_regions_dir, mean_image, val_start_id, val_set_size_small, - batch_size, placeholders, 75, 75, - val_small_batch_creator) + batch_size, plholder_dict, 75, 75, + val_small_batch_creator, + parsed_q_dict) print('Iter: ' + str(i+1) + ' Val Sm Acc: ' + str(val_accuracy)) @@ -405,8 +434,9 @@ def train(train_params): vocab, image_regions_dir, mean_image, val_start_id, val_set_size, batch_size, - placeholders, 75, 75, - val_batch_creator) + plholder_dict, 75, 75, + val_batch_creator, + parsed_q_dict) print('Val Acc: ' + str(val_acc_array_epoch[epoch]) + ' Train Acc: ' + str(train_acc_array_epoch[epoch])) diff --git a/classifiers/answer_classifier/train_ans_classifier_simple.py b/classifiers/answer_classifier/train_ans_classifier_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..79606761f72bdddff69bccd3863050c254d9079f --- /dev/null +++ b/classifiers/answer_classifier/train_ans_classifier_simple.py @@ -0,0 +1,352 @@ +import sys +import os +import json +import matplotlib.pyplot as plt +import matplotlib.image as mpimg +import numpy as np +import math +import random +import pdb +import tensorflow as tf +import object_classifiers.obj_data_io_helper as obj_data_loader +import attribute_classifiers.atr_data_io_helper as atr_data_loader +import tf_graph_creation_helper as graph_creator +import ans_graph_creator +import rel_graph_creator +import plot_helper as plotter +import ans_data_io_helper as ans_io_helper +import region_ranker.perfect_ranker as region_proposer +import time + +val_start_id = 89645 +val_set_size = 5000 +val_set_size_small = 500 + + +def evaluate(accuracy, qa_anno_dict, region_anno_dict, ans_vocab, vocab, + image_dir, mean_image, start_index, val_set_size, batch_size, + plholder_dict, img_height, img_width, batch_creator, + parsed_q_dict): + + correct = 0 + max_iter = int(math.floor(val_set_size/batch_size)) + for i in xrange(max_iter): + region_images, ans_labels, parsed_q, \ + region_score, partition= batch_creator \ + .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, + ans_vocab, vocab, image_dir, mean_image, + start_index+i*batch_size, batch_size, + parsed_q_dict, + img_height, img_width, 3) + + feed_dict = ans_io_helper.\ + AnsFeedDictCreator(region_images, ans_labels, parsed_q, + region_score, 1.0, plholder_dict, + vocab, False).feed_dict + + correct = correct + accuracy.eval(feed_dict) + + return correct/max_iter + + +def train(train_params): + sess = tf.InteractiveSession() + + train_anno_filename = train_params['train_json'] + test_anno_filename = train_params['test_json'] + parsed_q_filename = train_params['parsed_q_json'] + regions_anno_filename = train_params['regions_json'] + image_dir = train_params['image_dir'] + image_regions_dir = train_params['image_regions_dir'] + outdir = train_params['outdir'] + rel_model = train_params['rel_model'] + obj_atr_model = train_params['obj_atr_model'] + batch_size = train_params['batch_size'] + + if not os.path.exists(outdir): + os.mkdir(outdir) + + qa_anno_dict = ans_io_helper.parse_qa_anno(train_anno_filename) + parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename) + region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) + ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() + vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict) +# vocab = 
ans_io_helper.join_vocab(vocab, ans_vocab) + + # Save region crops + if train_params['crop_n_save_regions'] == True: + qa_anno_dict_test = ans_io_helper.parse_qa_anno(test_anno_filename) + ans_io_helper.save_regions(image_dir, image_regions_dir, + qa_anno_dict, region_anno_dict, + 1, 94644, 75, 75) + ans_io_helper.save_regions(image_dir, image_regions_dir, + qa_anno_dict_test, region_anno_dict, + 94645, 143495-94645+1, 75, 75) + + # Create graph + g = tf.get_default_graph() + plholder_dict = graph_creator.placeholder_inputs_ans(len(vocab), + len(ans_vocab), + mode='gt') + + image_regions = plholder_dict['image_regions'] + questions = plholder_dict['questions'] + keep_prob = plholder_dict['keep_prob'] + y = plholder_dict['gt_answer'] + region_score = plholder_dict['region_score'] + + y_pred_obj = graph_creator.obj_comp_graph(image_regions, 1.0) + obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') + obj_feat = obj_feat_op.outputs[0] + y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) + atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') + atr_feat = atr_feat_op.outputs[0] + + rel_graph = rel_graph_creator.rel_graph_creator(plholder_dict, + 1.0, + obj_feat, + atr_feat, + y_pred_obj, + y_pred_atr, + len(vocab), + batch_size, + graph_creator.graph_config, + 'q_obj_atr_reg_explt', + False) + + pred_rel_score = rel_graph.rel_score + + # Restore rel, obj and attribute classifier parameters + rel_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='rel') + obj_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='obj') + atr_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='atr') + + obj_atr_saver = tf.train.Saver(obj_vars+atr_vars) + rel_saver = tf.train.Saver(rel_vars) + + rel_saver.restore(sess, rel_model) + obj_atr_saver.restore(sess, obj_atr_model) + + ans_graph = ans_graph_creator.ans_graph_creator(plholder_dict, + obj_feat, + atr_feat, + y_pred_obj, + y_pred_atr, + vocab, + inv_vocab, + ans_vocab, + batch_size, + graph_creator.graph_config, + train_params['mode'], + True) + + y_pred = ans_graph.cosine_dist + + pred_rel_score_vec = tf.reshape(pred_rel_score, + [1, batch_size*ans_io_helper.num_proposals]) + + y_avg = graph_creator.aggregate_y_pred(y_pred, + region_score, batch_size, + ans_io_helper.num_proposals, + len(ans_vocab)) + +# cross_entropy = graph_creator.loss(y, y_avg) + total_loss = graph_creator.margin_loss(y, y_avg, 0.1) + accuracy = graph_creator.evaluation(y, y_avg) + + # Collect variables + ans_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='ans') + vars_to_regularize = tf.get_collection('regularize') + + for var in vars_to_regularize: + print var.name + total_loss += 1e-4 * tf.nn.l2_loss(var) + + # Model to restore some of the weights from + if train_params['mode']=='q': + partial_model = '' + + elif train_params['mode']=='q_obj_atr' or \ + train_params['mode']=='q_reg': + partial_model = os.path.join(outdir, 'ans_classifier_q-' + \ + str(train_params['start_model'])) + + elif train_params['mode']=='q_obj_atr_reg': + # partial_model = os.path.join(outdir, 'ans_classifier_q_obj_atr-' + \ + # str(train_params['start_model'])) + partial_model = '' + + # Fine tune begining with a previous model + if train_params['fine_tune']==True: + partial_model = os.path.join(outdir, 'ans_classifier_' + \ + train_params['mode'] + '-' + \ + str(train_params['start_model'])) + start_epoch = train_params['start_model']+1 + + partial_restorer = tf.train.Saver() + else: + start_epoch = 0 + if train_params['mode']!='q': + partial_restorer = 
tf.train.Saver() + + # Restore partial model + if os.path.exists(partial_model): + partial_restorer.restore(sess, partial_model) + + # Save trained vars + model_saver = tf.train.Saver() + all_vars_without_optim = tf.all_variables() + + # Attach optimization ops + word_vecs = tf.get_collection('variables','ans/word_vecs') + # vars_to_train = [var for var in ans_vars if + # 'ans/word_vecs' not in var.name] + vars_to_train = ans_vars + train_step = tf.train.AdamOptimizer(train_params['adam_lr']) \ + .minimize(total_loss, var_list = vars_to_train) + + # Initialize vars_to_init + all_vars = tf.all_variables() + optimizer_vars = [var for var in all_vars if var not in + all_vars_without_optim] + + print('Optimizer Variables: ') + print([var.name for var in optimizer_vars]) + print('------------------') + + if train_params['mode']=='q': + vars_to_init = ans_vars + optimizer_vars + else: + vars_to_init = ans_vars + optimizer_vars + + sess.run(tf.initialize_variables(vars_to_init)) + + # Load mean image + mean_image = np.load('/home/tanmay/Code/GenVQA/Exp_Results/' + \ + 'Obj_Classifier/mean_image.npy') + + # Start Training + max_epoch = train_params['max_epoch'] + max_iter = 4400*2 + val_acc_array_epoch = np.zeros([max_epoch]) + train_acc_array_epoch = np.zeros([max_epoch]) + + # Batch creators + train_batch_creator = ans_io_helper.batch_creator(1, max_iter*batch_size) + val_batch_creator = ans_io_helper.batch_creator(val_start_id, val_start_id + + val_set_size - 1) + val_small_batch_creator = ans_io_helper.batch_creator(val_start_id, + val_start_id + + val_set_size_small-1) + + # Check accuracy of restored model + # if train_params['fine_tune']==True: + # restored_accuracy = evaluate(accuracy, qa_anno_dict, + # region_anno_dict, ans_vocab, + # vocab, image_regions_dir, + # mean_image, val_start_id, + # val_set_size, batch_size, + # plholder_dict, 75, 75, + # val_batch_creator, + # parsed_q_dict) + # print('Accuracy of restored model: ' + str(restored_accuracy)) + + # Accuracy filename + train_accuracy_txtfile = os.path.join(outdir,'train_accuracy_' + \ + train_params['mode'] + '.txt') + val_accuracy_txtfile = os.path.join(outdir,'val_accuracy_' + \ + train_params['mode'] + '.txt') + + for epoch in range(start_epoch, max_epoch): + train_batch_creator.shuffle_ids() + for i in range(max_iter): + train_region_images, train_ans_labels, train_parsed_q, \ + train_region_score, train_partition= train_batch_creator \ + .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, + ans_vocab, vocab, + image_regions_dir, mean_image, + 1+i*batch_size, batch_size, + parsed_q_dict, + 75, 75, 3) + + feed_dict_train = ans_io_helper \ + .AnsFeedDictCreator(train_region_images, + train_ans_labels, + train_parsed_q, + train_region_score, + 0.5, + plholder_dict, + vocab, + True).feed_dict + + _, current_train_batch_acc, y_avg_eval, loss_eval = \ + sess.run([train_step, accuracy, y_avg, total_loss], + feed_dict=feed_dict_train) + + # print(y_avg_eval[0,:]) + # print(train_ans_labels[0,:]) + +# rel_logits = g.get_operation_by_name('rel/fc2/vec_logits') +# print(rel_logits.outputs[0].eval(feed_dict_train)) +# print (pred_rel_score.eval(feed_dict_train)) + + assert (not np.any(np.isnan(y_avg_eval))), 'NaN predicted' + + train_acc_array_epoch[epoch] = train_acc_array_epoch[epoch] + \ + current_train_batch_acc + + if (i+1)%500==0: + val_accuracy = evaluate(accuracy, qa_anno_dict, + region_anno_dict, ans_vocab, vocab, + image_regions_dir, mean_image, + val_start_id, val_set_size_small, + batch_size, plholder_dict, 75, 75, + 
val_small_batch_creator, + parsed_q_dict) + + print('Iter: ' + str(i+1) + ' Val Sm Acc: ' + str(val_accuracy)) + + train_acc_array_epoch[epoch] = train_acc_array_epoch[epoch] / max_iter + val_acc_array_epoch[epoch] = evaluate(accuracy, qa_anno_dict, + region_anno_dict, ans_vocab, + vocab, image_regions_dir, + mean_image, val_start_id, + val_set_size, batch_size, + plholder_dict, 75, 75, + val_batch_creator, + parsed_q_dict) + + print('Val Acc: ' + str(val_acc_array_epoch[epoch]) + + ' Train Acc: ' + str(train_acc_array_epoch[epoch])) + + + if train_params['fine_tune']==True: + plot_path = os.path.join(outdir, 'acc_vs_epoch_' \ + + train_params['mode'] + '_fine_tuned.pdf') + else: + plot_path = os.path.join(outdir, 'acc_vs_epoch_' \ + + train_params['mode'] + '.pdf') + + plotter.write_accuracy_to_file(start_epoch, epoch, + train_acc_array_epoch, + train_params['fine_tune'], + train_accuracy_txtfile) + plotter.write_accuracy_to_file(start_epoch, epoch, + val_acc_array_epoch, + train_params['fine_tune'], + val_accuracy_txtfile) + plotter.plot_accuracies(xdata=np.arange(0, epoch + 1) + 1, + ydata_train=train_acc_array_epoch[0:epoch + 1], + ydata_val=val_acc_array_epoch[0:epoch + 1], + xlim=[1, max_epoch], ylim=[0, 1.0], + savePath=plot_path) + + save_path = model_saver.save(sess, + os.path.join(outdir, 'ans_classifier_' + \ + train_params['mode']), global_step=epoch) + + sess.close() + tf.reset_default_graph() + +if __name__=='__main__': + print 'Hello' diff --git a/classifiers/inherit_example.py b/classifiers/inherit_example.py new file mode 100644 index 0000000000000000000000000000000000000000..095345a194f8c76f04c1065b7aae4dbccf138bd3 --- /dev/null +++ b/classifiers/inherit_example.py @@ -0,0 +1,14 @@ +class baseclass(): + def __init__(self, a): + print a + + def baseMethod(self): + print 'Yeah inheritance' + +class derivedclass(baseclass): + def __init__(self, a, b): + baseclass.__init__(self, a) + print b + self.baseMethod() + +a = derivedclass(1,2) diff --git a/classifiers/region_ranker/eval_rel_classifier.py b/classifiers/region_ranker/eval_rel_classifier.py index 49046f51ad030decc8f96ccbce40c5e11a47c57c..0cc92cb8e68b05a23417d61142cdf0ddfca1b680 100644 --- a/classifiers/region_ranker/eval_rel_classifier.py +++ b/classifiers/region_ranker/eval_rel_classifier.py @@ -17,6 +17,7 @@ def eval(eval_params): sess = tf.InteractiveSession() train_anno_filename = eval_params['train_json'] test_anno_filename = eval_params['test_json'] + parsed_q_filename = eval_params['parsed_q_json'] regions_anno_filename = eval_params['regions_json'] image_regions_dir = eval_params['image_regions_dir'] outdir = eval_params['outdir'] @@ -33,6 +34,7 @@ def eval(eval_params): qa_anno_dict_train = ans_io_helper.parse_qa_anno(train_anno_filename) qa_anno_dict = ans_io_helper.parse_qa_anno(test_anno_filename) + parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename) region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict_train) @@ -40,24 +42,24 @@ def eval(eval_params): # Create graph g = tf.get_default_graph() - image_regions, questions, y, keep_prob = \ + plholder_dict = \ graph_creator.placeholder_inputs_rel(ans_io_helper.num_proposals, len(vocab), mode='gt') - placeholders = [image_regions, questions, y, keep_prob] + image_regions = plholder_dict['image_regions'] + y = plholder_dict['gt_scores'] + keep_prob = plholder_dict['keep_prob'] + y_pred_obj = 
graph_creator.obj_comp_graph(image_regions, 1.0) obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') obj_feat = obj_feat_op.outputs[0] y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') atr_feat = atr_feat_op.outputs[0] - y_pred = graph_creator.rel_comp_graph(image_regions, questions, + y_pred = graph_creator.rel_comp_graph(plholder_dict, + obj_feat, atr_feat, y_pred_obj, y_pred_atr, mode, keep_prob, len(vocab), batch_size) - # y_pred = graph_creator.rel_comp_graph(image_regions, questions, - # obj_feat, atr_feat, mode, - # keep_prob, len(vocab), batch_size) - # Restore model restorer = tf.train.Saver() if os.path.exists(model): @@ -76,11 +78,11 @@ def eval(eval_params): # Test Recall test_recall = rel_trainer.evaluate(y_pred, qa_anno_dict, - region_anno_dict, ans_vocab, - vocab, image_regions_dir, - mean_image, test_start_id, - test_set_size, batch_size, - placeholders, 75, 75, - test_batch_creator,verbose=True) + region_anno_dict, parsed_q_dict, + ans_vocab, vocab, + image_regions_dir, mean_image, + test_start_id, test_set_size, + batch_size, plholder_dict, + 75, 75, test_batch_creator,verbose=True) print('Test Rec: ' + str(test_recall)) diff --git a/classifiers/region_ranker/eval_rel_classifier_simple.py b/classifiers/region_ranker/eval_rel_classifier_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..b292610a5eceb9386e0695324982de157c034fb2 --- /dev/null +++ b/classifiers/region_ranker/eval_rel_classifier_simple.py @@ -0,0 +1,118 @@ +import sys +import os +import json +import math +import matplotlib.pyplot as plt +import matplotlib.image as mpimg +import numpy as np +import pdb +import tensorflow as tf +import answer_classifier.ans_data_io_helper as ans_io_helper +import region_ranker.perfect_ranker as region_proposer +#import region_ranker.train_rel_classifier as rel_trainer +import region_ranker.train_rel_classifier_simple as rel_trainer +import tf_graph_creation_helper as graph_creator +import rel_graph_creator +import plot_helper as plotter + +def eval(eval_params): + sess = tf.InteractiveSession() + train_anno_filename = eval_params['train_json'] + test_anno_filename = eval_params['test_json'] + parsed_q_filename = eval_params['parsed_q_json'] + regions_anno_filename = eval_params['regions_json'] + whole_image_dir = eval_params['image_dir'] + image_regions_dir = eval_params['image_regions_dir'] + outdir = eval_params['outdir'] + model_basedir = eval_params['model_basedir'] + model_number = eval_params['model_number'] + mode = eval_params['mode'] + batch_size = eval_params['batch_size'] + test_start_id = eval_params['test_start_id'] + test_set_size = eval_params['test_set_size'] + model = os.path.join(model_basedir, 'rel_classifier_' + mode + \ + '-' + str(model_number)) + if not os.path.exists(outdir): + os.mkdir(outdir) + + qa_anno_dict_train = ans_io_helper.parse_qa_anno(train_anno_filename) + qa_anno_dict = ans_io_helper.parse_qa_anno(test_anno_filename) + parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename) + region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) + ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() + vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict_train) + + + # Create graph + g = tf.get_default_graph() + plholder_dict = \ + graph_creator.placeholder_inputs_rel(ans_io_helper.num_proposals, + len(vocab), mode='gt') + image_regions = plholder_dict['image_regions'] + y = 
plholder_dict['gt_scores'] + keep_prob = plholder_dict['keep_prob'] + + y_pred_obj = graph_creator.obj_comp_graph(image_regions, 1.0) + obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') + obj_feat = obj_feat_op.outputs[0] + y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) + atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') + atr_feat = atr_feat_op.outputs[0] + rel_graph = rel_graph_creator.rel_graph_creator(plholder_dict, + keep_prob, + obj_feat, + atr_feat, + y_pred_obj, + y_pred_atr, + len(vocab), + batch_size, + graph_creator.graph_config, + mode, + False) + y_pred = rel_graph.rel_score + + # Restore model + restorer = tf.train.Saver() + if os.path.exists(model): + restorer.restore(sess, model) + else: + print 'Failed to read model from file ' + model + + # Load mean image + mean_image = np.load('/home/tanmay/Code/GenVQA/Exp_Results/' + \ + 'Obj_Classifier/mean_image.npy') + + # Batch creator + test_batch_creator = ans_io_helper.batch_creator(test_start_id, + test_start_id + + test_set_size - 1) + + # Test Recall + # test_recall = rel_trainer.evaluate(y_pred, qa_anno_dict, + # region_anno_dict, parsed_q_dict, + # ans_vocab, vocab, + # image_regions_dir, mean_image, + # test_start_id, test_set_size, + # batch_size, plholder_dict, + # 75, 75, test_batch_creator,verbose=True) + + html_dir = os.path.join(outdir,'rel_html') + test_recall = rel_trainer.evaluate_with_vis(y_pred, + qa_anno_dict, + region_anno_dict, + parsed_q_dict, + ans_vocab, + vocab, + image_regions_dir, + mean_image, + test_start_id, + test_set_size, + batch_size, + plholder_dict, + 75, + 75, + test_batch_creator, + html_dir, + whole_image_dir, + verbose=True) + print('Test Rec: ' + str(test_recall)) diff --git a/classifiers/region_ranker/perfect_ranker.py b/classifiers/region_ranker/perfect_ranker.py index 2dd92d5c86111b467378c1558520de572320bedc..2caa63a632a19c78931af53c6fe1fc097252fd4e 100644 --- a/classifiers/region_ranker/perfect_ranker.py +++ b/classifiers/region_ranker/perfect_ranker.py @@ -51,7 +51,7 @@ def get_region_coords(img_height, img_width): print(region_coords) return region_coords, region_coords_ -def rank_regions(image, question, region_coords, region_coords_, +def rank_regions2(image, question, region_coords, region_coords_, gt_regions_for_image, crop=True): num_regions, _ = region_coords.shape @@ -83,6 +83,7 @@ def rank_regions(image, question, region_coords, region_coords_, no_regions_flag = True else: for gt_region in gt_regions_for_image: + gt_x1, gt_y1, gt_x2, gt_y2 = gt_regions_for_image[gt_region] if gt_x1==x1_ and gt_x2==x2_ and gt_y1==y1_ and \ gt_y2==y2_ and gt_region in question: @@ -102,6 +103,99 @@ def rank_regions(image, question, region_coords, region_coords_, return regions +def rank_regions(image, question, region_coords, region_coords_, + gt_regions_for_image, crop=True): + + num_regions, _ = region_coords.shape + regions = dict() + coord_list = [] + no_regions_flag = False + if question is not None: + if 'How many' in question: + no_regions_flag = True + elif 'What color' in question: + split_question = question.split(" ") + gt_region = split_question[-1] + gt_region = gt_region[:-1] + if gt_region not in gt_regions_for_image: + no_regions_flag = True + else: + coord_list.append(gt_regions_for_image[gt_region]) + elif 'below' in question: + split_question = question.split(" ") + gt_region = " ".join(split_question[3:5]) + if gt_region not in gt_regions_for_image: + no_regions_flag = True + else: + 
coord_list.append(gt_regions_for_image[gt_region]) + gt_region = " ".join(split_question[7:9]) + gt_region = gt_region[:-1] + if gt_region not in gt_regions_for_image: + no_regions_flag = True + else: + coord_list.append(gt_regions_for_image[gt_region]) + gt_region = " ".join(split_question[3:9]) + gt_region = gt_region[:-1] + if gt_region not in gt_regions_for_image: + no_regions_flag = True + else: + coord_list.append(gt_regions_for_image[gt_region]) + elif 'Is there' in question: + split_question = question.split(" ") + gt_region = " ".join(split_question[3:5]) + gt_region = gt_region[:-1] + if gt_region not in gt_regions_for_image: + no_regions_flag = True + else: + coord_list.append(gt_regions_for_image[gt_region]) + + num_gt_regions = len(coord_list) + for i in xrange(num_regions): + x1_ = region_coords_[i,0] + y1_ = region_coords_[i,1] + x2_ = region_coords_[i,2] + y2_ = region_coords_[i,3] + + x1 = region_coords[i,0] + y1 = region_coords[i,1] + x2 = region_coords[i,2] + y2 = region_coords[i,3] + + if crop: + cropped_image = image[y1-1:y2, x1-1:x2, :] + else: + cropped_image = None + + score = 0.0 + if no_regions_flag: + score = 1.0/num_regions + else: + for coord in coord_list: + gt_x1, gt_y1, gt_x2, gt_y2 = coord + if gt_x1==x1_ and gt_x2==x2_ and gt_y1==y1_ and gt_y2==y2_: + score = 1.0/num_gt_regions + break; + + regions[i] = region(image=cropped_image, score=score, + coord=region_coords[i,:]) + + return regions + +def get_rel_map(image, scores, region_coords): + num_regions = region_coords.shape[0] + h, w, c = image.shape + rel_map = np.zeros(shape=[h, w, num_regions], dtype=np.float32) + for i in xrange(num_regions): + x1 = region_coords[i,0] + y1 = region_coords[i,1] + x2 = region_coords[i,2] + y2 = region_coords[i,3] + rel_map[y1-1:y2, x1-1:x2, i] = scores[i] + + rel_map = rel_map.max(axis=2) + rel_map = rel_map + return rel_map + if __name__=='__main__': image_dir = '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/images/' diff --git a/classifiers/region_ranker/train_rel_classifier.py b/classifiers/region_ranker/train_rel_classifier.py index de0f0461bb20380ec215f487ad0e2307035d5e26..6ae315b94099acbba0b66229ab17458d1aafe20e 100644 --- a/classifiers/region_ranker/train_rel_classifier.py +++ b/classifiers/region_ranker/train_rel_classifier.py @@ -39,34 +39,38 @@ def batch_recall(pred_scores, gt_scores, k): return batch_recall -def evaluate(region_score_pred, qa_anno_dict, region_anno_dict, ans_vocab, vocab, - image_dir, mean_image, start_index, val_set_size, batch_size, - placeholders, img_height, img_width, batch_creator, verbose=False): +def evaluate(region_score_pred, qa_anno_dict, region_anno_dict, parsed_q_dict, + ans_vocab, vocab, image_dir, mean_image, start_index, val_set_size, + batch_size, plholder_dict, img_height, img_width, batch_creator, + verbose=False): recall_at_k = 0 max_iter = int(math.floor(val_set_size/batch_size)) for i in xrange(max_iter): if verbose==True: print('Iter: ' + str(i+1) + '/' + str(max_iter)) - region_images, ans_labels, questions, \ - region_score_vec, partition= batch_creator \ + region_images, ans_labels, parsed_q, \ + region_scores_vec, partition= batch_creator \ .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, ans_vocab, vocab, image_dir, mean_image, start_index+i*batch_size, batch_size, + parsed_q_dict, img_height, img_width, 3) - region_score = batch_creator.reshape_score(region_score_vec) + region_scores = batch_creator.reshape_score(region_scores_vec) - feed_dict = { - placeholders[0] : region_images, - placeholders[1] : 
questions, - placeholders[2] : region_score, - placeholders[3] : 1.0, - } + feed_dict = ans_io_helper \ + .RelFeedDictCreator(region_images, + parsed_q, + region_scores, + 1.0, + plholder_dict, + vocab).feed_dict region_score_pred_eval = region_score_pred.eval(feed_dict) + print region_score_pred_eval recall_at_k += batch_recall(region_score_pred_eval, - region_score, -1) + region_scores, -1) recall_at_k /= max_iter @@ -77,6 +81,7 @@ def train(train_params): sess = tf.InteractiveSession() train_anno_filename = train_params['train_json'] test_anno_filename = train_params['test_json'] + parsed_q_filename = train_params['parsed_q_json'] regions_anno_filename = train_params['regions_json'] image_dir = train_params['image_dir'] image_regions_dir = train_params['image_regions_dir'] @@ -89,6 +94,7 @@ def train(train_params): os.mkdir(outdir) qa_anno_dict = ans_io_helper.parse_qa_anno(train_anno_filename) + parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename) region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict) @@ -106,24 +112,24 @@ def train(train_params): # Create graph g = tf.get_default_graph() - image_regions, questions, y, keep_prob = \ + plholder_dict = \ graph_creator.placeholder_inputs_rel(ans_io_helper.num_proposals, len(vocab), mode='gt') - placeholders = [image_regions, questions, y, keep_prob] + image_regions = plholder_dict['image_regions'] + y = plholder_dict['gt_scores'] + keep_prob = plholder_dict['keep_prob'] + y_pred_obj = graph_creator.obj_comp_graph(image_regions, 1.0) obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') obj_feat = obj_feat_op.outputs[0] y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') atr_feat = atr_feat_op.outputs[0] - y_pred = graph_creator.rel_comp_graph(image_regions, questions, + y_pred = graph_creator.rel_comp_graph(plholder_dict, + obj_feat, atr_feat, y_pred_obj, y_pred_atr, mode, keep_prob, len(vocab), batch_size) - # y_pred = graph_creator.rel_comp_graph(image_regions, questions, - # obj_feat, atr_feat, mode, - # keep_prob, len(vocab), batch_size) - accuracy = graph_creator.evaluation(y, y_pred) cross_entropy = graph_creator.loss(y, y_pred) @@ -139,6 +145,7 @@ def train(train_params): 'rel/fc1/W_q', 'rel/fc1/W_obj', 'rel/fc1/W_atr', + 'rel/fc1/W_explt', 'rel/fc1/b', 'rel/fc2/W', 'rel/fc2/b', @@ -161,6 +168,7 @@ def train(train_params): vars_dict['rel/fc1/W_q'], vars_dict['rel/fc1/W_obj'], vars_dict['rel/fc1/W_atr'], + vars_dict['rel/fc1/W_explt'], vars_dict['rel/fc2/W'], ] @@ -244,12 +252,12 @@ def train(train_params): # Check accuracy of restored model if train_params['fine_tune']==True: - restored_recall = evaluate(y_pred, qa_anno_dict, - region_anno_dict, ans_vocab, + restored_recall = evaluate(y_pred, qa_anno_dict, region_anno_dict, + parsed_q_dict, ans_vocab, vocab, image_regions_dir, mean_image, val_start_id, val_set_size, batch_size, - placeholders, 75, 75, + plholder_dict, 75, 75, val_batch_creator) print('Recall of restored model: ' + str(restored_recall)) @@ -261,23 +269,26 @@ def train(train_params): train_batch_creator.shuffle_ids() for i in range(max_iter): - train_region_images, train_ans_labels, train_questions, \ + train_region_images, train_ans_labels, train_parsed_q, \ train_region_score_vec, train_partition= train_batch_creator \ .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, 
ans_vocab, vocab, image_regions_dir, mean_image, - 1+i*batch_size, batch_size, + 1+i*batch_size, batch_size, + parsed_q_dict, 75, 75, 3) + train_region_score = train_batch_creator \ .reshape_score(train_region_score_vec) - feed_dict_train = { - image_regions : train_region_images, - questions: train_questions, - keep_prob: 0.5, - y: train_region_score, - } - + feed_dict_train = ans_io_helper \ + .RelFeedDictCreator(train_region_images, + train_parsed_q, + train_region_score, + 0.5, + plholder_dict, + vocab).feed_dict + _, current_train_batch_acc, y_pred_eval, loss_eval = \ sess.run([train_step, accuracy, y_pred, total_loss], feed_dict=feed_dict_train) @@ -289,23 +300,23 @@ def train(train_params): train_region_score, -1) if (i+1)%500==0: - val_recall = evaluate(y_pred, qa_anno_dict, - region_anno_dict, ans_vocab, vocab, + val_recall = evaluate(y_pred, qa_anno_dict, region_anno_dict, + parsed_q_dict, ans_vocab, vocab, image_regions_dir, mean_image, val_start_id, val_set_size_small, - batch_size, placeholders, 75, 75, + batch_size, plholder_dict, 75, 75, val_small_batch_creator) print('Iter: ' + str(i+1) + ' Val Sm Rec: ' + str(val_recall)) train_rec_array_epoch[epoch] = train_rec_array_epoch[epoch] / max_iter val_rec_array_epoch[epoch] = evaluate(y_pred, qa_anno_dict, - region_anno_dict, ans_vocab, - vocab, image_regions_dir, - mean_image, val_start_id, - val_set_size, batch_size, - placeholders, 75, 75, - val_batch_creator) + region_anno_dict, parsed_q_dict, + ans_vocab, vocab, + image_regions_dir, mean_image, + val_start_id, val_set_size, + batch_size, plholder_dict, + 75, 75, val_batch_creator) print('Val Rec: ' + str(val_rec_array_epoch[epoch]) + ' Train Rec: ' + str(train_rec_array_epoch[epoch])) diff --git a/classifiers/region_ranker/train_rel_classifier_simple.py b/classifiers/region_ranker/train_rel_classifier_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..4c2d77d5962f5e13d63e1760ea48485d72587511 --- /dev/null +++ b/classifiers/region_ranker/train_rel_classifier_simple.py @@ -0,0 +1,394 @@ +import sys +import os +import json +import math +import matplotlib.pyplot as plt +import matplotlib.image as mpimg +import scipy.misc +import numpy as np +import pdb +import tensorflow as tf +import answer_classifier.ans_data_io_helper as ans_io_helper +import region_ranker.perfect_ranker as region_proposer +import tf_graph_creation_helper as graph_creator +import rel_graph_creator +import plot_helper as plotter + +val_start_id = 89645 +val_set_size = 5000 +val_set_size_small = 500 + +def recall(pred_scores, gt_scores, k): + inc_order = np.argsort(pred_scores, 0) + dec_order = inc_order[::-1] + gt_scores_ordered = gt_scores[dec_order] + rel_reg_recalled = np.sum(gt_scores_ordered[0:k]!=0) + rel_reg = np.sum(gt_scores!=0) + return rel_reg_recalled/(rel_reg+0.00001) + + +def batch_recall(pred_scores, gt_scores, k): + batch_size = pred_scores.shape[0] + batch_recall = 0.0 + for i in xrange(batch_size): + if k==-1: + k_ = np.sum(gt_scores[i,:]!=0) + else: + k_ = k + batch_recall += recall(pred_scores[i,:], gt_scores[i,:], k_) + + batch_recall = batch_recall/batch_size + + return batch_recall + + +def evaluate(region_score_pred, qa_anno_dict, region_anno_dict, parsed_q_dict, + ans_vocab, vocab, image_dir, mean_image, start_index, val_set_size, + batch_size, plholder_dict, img_height, img_width, batch_creator, + verbose=False): + + recall_at_k = 0 + max_iter = int(math.floor(val_set_size/batch_size)) + for i in xrange(max_iter): + if verbose==True: + print('Iter: ' + 
str(i+1) + '/' + str(max_iter)) + region_images, ans_labels, parsed_q, \ + region_scores_vec, partition= batch_creator \ + .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, + ans_vocab, vocab, image_dir, mean_image, + start_index+i*batch_size, batch_size, + parsed_q_dict, + img_height, img_width, 3) + region_scores = batch_creator.reshape_score(region_scores_vec) + + feed_dict = ans_io_helper \ + .RelFeedDictCreator(region_images, + parsed_q, + region_scores, + 1.0, + plholder_dict, + vocab, + False).feed_dict + + region_score_pred_eval = region_score_pred.eval(feed_dict) + + recall_at_k += batch_recall(region_score_pred_eval, + region_scores, -1) + + recall_at_k /= max_iter + + return recall_at_k + +def evaluate_with_vis(region_score_pred, + qa_anno_dict, + region_anno_dict, + parsed_q_dict, + ans_vocab, + vocab, + image_dir, + mean_image, + start_index, + val_set_size, + batch_size, + plholder_dict, + img_height, + img_width, + batch_creator, + html_dir, + whole_image_dir, + verbose=False): + + if not os.path.exists(html_dir): + os.mkdir(html_dir) + + html_filename = os.path.join(html_dir,'index.html') + html_writer = ans_io_helper.html_ans_table_writer(html_filename) + + recall_at_k = 0 + max_iter = int(math.floor(val_set_size/batch_size)) + + for i in xrange(max_iter): + if verbose==True: + print('Iter: ' + str(i+1) + '/' + str(max_iter)) + region_images, ans_labels, parsed_q, \ + region_scores_vec, partition= batch_creator \ + .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, + ans_vocab, vocab, image_dir, mean_image, + start_index+i*batch_size, batch_size, + parsed_q_dict, + img_height, img_width, 3) + region_scores = batch_creator.reshape_score(region_scores_vec) + + feed_dict = ans_io_helper \ + .RelFeedDictCreator(region_images, + parsed_q, + region_scores, + 1.0, + plholder_dict, + vocab, + False).feed_dict + + region_score_pred_eval = region_score_pred.eval(feed_dict) + print region_score_pred_eval.shape + recall_at_k += batch_recall(region_score_pred_eval, + region_scores, -1) + + q_ids = batch_creator.qa_index(start_index+i*batch_size, batch_size) + for j in xrange(batch_size): + q_id = q_ids[j] + image_id = qa_anno_dict[q_id].image_id + question = qa_anno_dict[q_id].question + answer = qa_anno_dict[q_id].answer + image = mpimg.imread(os.path.join(whole_image_dir, + '{}.jpg'.format(image_id))) + rel_map = region_proposer.get_rel_map(image, + region_score_pred_eval[j,:], + ans_io_helper.region_coords_) + rel_map_stacked = np.dstack((rel_map, rel_map, rel_map)) + + image = np.multiply(image, rel_map_stacked) + \ + np.multiply(0*image+255, 1-rel_map_stacked) + + image_filename = os.path.join(html_dir, + str(image_id) + '_' + \ + str(q_id) + '.jpg') + scipy.misc.imsave(image_filename, image.astype(np.uint8)) + col_dict = { + 0: q_id, + 1: question, + 2: answer, + 3: html_writer.image_tag(str(image_id) + '_' + \ + str(q_id) + '.jpg', 50, 50), + } + html_writer.add_element(col_dict) + + html_writer.close_file() + recall_at_k /= max_iter + + return recall_at_k + + +def train(train_params): + sess = tf.InteractiveSession() + train_anno_filename = train_params['train_json'] + test_anno_filename = train_params['test_json'] + parsed_q_filename = train_params['parsed_q_json'] + regions_anno_filename = train_params['regions_json'] + image_dir = train_params['image_dir'] + image_regions_dir = train_params['image_regions_dir'] + outdir = train_params['outdir'] + batch_size = train_params['batch_size'] + obj_atr_model = train_params['obj_atr_model'] + mode = train_params['mode'] + + 
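+    # A commented sketch of the train_params dict this function expects;
+    # the path values here are placeholders, the actual configuration is
+    # rel_classifier_train_params in classifiers/train_classifiers.py:
+    #   train_params = {
+    #       'train_json': '.../train_anno.json',
+    #       'test_json': '.../test_anno.json',
+    #       'parsed_q_json': '.../parsed_questions.json',
+    #       'regions_json': '.../regions_anno.json',
+    #       'image_dir': '.../images',
+    #       'image_regions_dir': '.../image_regions',
+    #       'outdir': '.../Rel_Classifier_Experiment',
+    #       'obj_atr_model': '.../obj_atr_classifier-1',
+    #       'mode': 'q_obj_atr_reg_explt',
+    #       'adam_lr': 0.001,
+    #       'batch_size': 10,
+    #       'max_epoch': 5,
+    #       'crop_n_save_regions': False,
+    #       'fine_tune': False,
+    #       'start_model': 0,
+    #   }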
if not os.path.exists(outdir): + os.mkdir(outdir) + + qa_anno_dict = ans_io_helper.parse_qa_anno(train_anno_filename) + parsed_q_dict = ans_io_helper.read_parsed_questions(parsed_q_filename) + region_anno_dict = region_proposer.parse_region_anno(regions_anno_filename) + ans_vocab, inv_ans_vocab = ans_io_helper.create_ans_dict() + vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict) + + # Save region crops + if train_params['crop_n_save_regions'] == True: + qa_anno_dict_test = ans_io_helper.parse_qa_anno(test_anno_filename) + ans_io_helper.save_regions(image_dir, image_regions_dir, + qa_anno_dict, region_anno_dict, + 1, 94644, 75, 75) + ans_io_helper.save_regions(image_dir, image_regions_dir, + qa_anno_dict_test, region_anno_dict, + 94645, 143495-94645+1, 75, 75) + + + # Create graph + g = tf.get_default_graph() + plholder_dict = \ + graph_creator.placeholder_inputs_rel(ans_io_helper.num_proposals, + len(vocab), mode='gt') + image_regions = plholder_dict['image_regions'] + y = plholder_dict['gt_scores'] + keep_prob = plholder_dict['keep_prob'] + + y_pred_obj = graph_creator.obj_comp_graph(image_regions, 1.0) + obj_feat_op = g.get_operation_by_name('obj/conv2/obj_feat') + obj_feat = obj_feat_op.outputs[0] + y_pred_atr = graph_creator.atr_comp_graph(image_regions, 1.0, obj_feat) + atr_feat_op = g.get_operation_by_name('atr/conv2/atr_feat') + atr_feat = atr_feat_op.outputs[0] + rel_graph = rel_graph_creator.rel_graph_creator(plholder_dict, + keep_prob, + obj_feat, + atr_feat, + y_pred_obj, + y_pred_atr, + len(vocab), + batch_size, + graph_creator.graph_config, + mode, + True) + + y_pred = rel_graph.rel_score + + accuracy = graph_creator.evaluation(y, y_pred) + + cross_entropy = graph_creator.loss(y, y_pred) + + # Regularization + vars_to_regularize = tf.get_collection('regularize') + + total_loss = cross_entropy + for var in vars_to_regularize: + print var.name + total_loss += 1e-4 * tf.nn.l2_loss(var) + + # Restore weights + obj_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='obj') + atr_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='atr') + rel_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='rel') + + obj_atr_restorer = tf.train.Saver(obj_vars + atr_vars) + obj_atr_restorer.restore(sess, obj_atr_model) + + # Model to save and restore weights from + model_saver = tf.train.Saver() + + + if train_params['fine_tune']==True: + pretrained_model = os.path.join(outdir, 'rel_classifier_' + mode +'-'+ \ + str(train_params['start_model'])) + assert (os.path.exists(pretrained_model)), \ + 'Pretrained model does not exist' + model_saver.restore(sess, pretrained_model) + start_epoch = train_params['start_model'] + 1 + else: + assert (os.path.exists(obj_atr_model)), \ + 'Obj_Atr model does not exist' + start_epoch = 0 + + # Attach optimization ops + all_vars_without_optim = tf.all_variables() + vars_to_train = rel_vars + train_step = tf.train.AdamOptimizer(train_params['adam_lr']) \ + .minimize(total_loss, var_list=vars_to_train) + + # Initialize uninitialized vars + all_vars = tf.all_variables() + optimizer_vars = [var for var in all_vars if var not in + all_vars_without_optim] + + if train_params['fine_tune']: + vars_to_init = optimizer_vars + else: + vars_to_init = optimizer_vars + rel_vars + sess.run(tf.initialize_variables(vars_to_init)) + + # Load mean image + mean_image = np.load('/home/tanmay/Code/GenVQA/Exp_Results/' + \ + 'Obj_Classifier/mean_image.npy') + + # Start Training + max_epoch = train_params['max_epoch'] + max_iter = 4400*2 + val_rec_array_epoch = 
np.zeros([max_epoch]) + train_rec_array_epoch = np.zeros([max_epoch]) + + # Batch creators + train_batch_creator = ans_io_helper.batch_creator(1, max_iter*batch_size) + val_batch_creator = ans_io_helper.batch_creator(val_start_id, val_start_id + + val_set_size - 1) + val_small_batch_creator = ans_io_helper.batch_creator(val_start_id, + val_start_id + + val_set_size_small-1) + + # Check accuracy of restored model + if train_params['fine_tune']==True: + restored_recall = evaluate(y_pred, qa_anno_dict, region_anno_dict, + parsed_q_dict, ans_vocab, + vocab, image_regions_dir, + mean_image, val_start_id, + val_set_size, batch_size, + plholder_dict, 75, 75, + val_batch_creator) + print('Recall of restored model: ' + str(restored_recall)) + + # Accuracy filename + train_recall_txtfile = os.path.join(outdir,'train_recall_'+ mode +'.txt') + val_recall_txtfile = os.path.join(outdir,'val_recall_'+ mode +'.txt') + + for epoch in range(start_epoch, max_epoch): + train_batch_creator.shuffle_ids() + for i in range(max_iter): + + train_region_images, train_ans_labels, train_parsed_q, \ + train_region_score_vec, train_partition= train_batch_creator \ + .ans_mini_batch_loader(qa_anno_dict, region_anno_dict, + ans_vocab, vocab, + image_regions_dir, mean_image, + 1+i*batch_size, batch_size, + parsed_q_dict, + 75, 75, 3) + + train_region_score = train_batch_creator \ + .reshape_score(train_region_score_vec) + + feed_dict_train = ans_io_helper \ + .RelFeedDictCreator(train_region_images, + train_parsed_q, + train_region_score, + 0.5, + plholder_dict, + vocab, + True).feed_dict + + _, current_train_batch_acc, y_pred_eval, loss_eval = \ + sess.run([train_step, accuracy, y_pred, total_loss], + feed_dict=feed_dict_train) + + assert (not np.any(np.isnan(y_pred_eval))), 'NaN predicted' + + train_rec_array_epoch[epoch] = train_rec_array_epoch[epoch] + \ + batch_recall(y_pred_eval, + train_region_score, -1) + + if (i+1)%500==0: + val_recall = evaluate(y_pred, qa_anno_dict, region_anno_dict, + parsed_q_dict, ans_vocab, vocab, + image_regions_dir, mean_image, + val_start_id, val_set_size_small, + batch_size, plholder_dict, 75, 75, + val_small_batch_creator) + + print('Iter: ' + str(i+1) + ' Val Sm Rec: ' + str(val_recall)) + + train_rec_array_epoch[epoch] = train_rec_array_epoch[epoch] / max_iter + val_rec_array_epoch[epoch] = evaluate(y_pred, qa_anno_dict, + region_anno_dict, parsed_q_dict, + ans_vocab, vocab, + image_regions_dir, mean_image, + val_start_id, val_set_size, + batch_size, plholder_dict, + 75, 75, val_batch_creator) + + print('Val Rec: ' + str(val_rec_array_epoch[epoch]) + + ' Train Rec: ' + str(train_rec_array_epoch[epoch])) + + + plotter.write_accuracy_to_file(start_epoch, epoch, + train_rec_array_epoch, + train_params['fine_tune'], + train_recall_txtfile) + plotter.write_accuracy_to_file(start_epoch, epoch, + val_rec_array_epoch, + train_params['fine_tune'], + val_recall_txtfile) + + save_path = model_saver.save(sess, + os.path.join(outdir, 'rel_classifier_' + \ + mode), + global_step=epoch) + + sess.close() + tf.reset_default_graph() + + diff --git a/classifiers/rel_graph_creator.py b/classifiers/rel_graph_creator.py new file mode 100644 index 0000000000000000000000000000000000000000..8ddaa2ca797d033cbdd48bc0b798ca1e82215ee2 --- /dev/null +++ b/classifiers/rel_graph_creator.py @@ -0,0 +1,191 @@ +import numpy as np +import math +import pdb +import tensorflow as tf +import answer_classifier.ans_data_io_helper as ans_io_helper +from tf_graph_creation_helper import weight_variable, bias_variable, \ + 
q_bin_embed_graph, conv2d, max_pool_2x2, batchnorm, explicit_feat_graph + + +class rel_graph_creator(): + def __init__(self, + plholder_dict, + keep_prob, + obj_feat, + atr_feat, + obj_prob, + atr_prob, + vocab_size, + batch_size, + graph_config, + mode='q_obj_atr', + is_train=True): + + self.mode = mode + self.is_train = plholder_dict['is_train'] + self.keep_prob = keep_prob + image_regions = plholder_dict['image_regions'] + + with tf.name_scope('rel') as rel_graph: + # Word Vectors + word_vecs = self.create_word_vecs(vocab_size, + graph_config['word_vec_dim']) + + # Feature Computations + q_feat = self.add_q_feat_comp_layer(word_vecs, plholder_dict) + reg_feat = self.add_reg_feat_comp_layer(image_regions) + explt_feat = self.add_explt_feat_comp_layer(obj_prob, + atr_prob, + plholder_dict) + + # Feature Projections (with batch norm) + feat_proj_dim = graph_config['joint_embed_dim'] + proj_feat = dict() + + proj_feat['q'] = self.feat_proj_layer(q_feat, feat_proj_dim, + 'q_feat_proj_layer') + + proj_feat['reg'] = self.feat_proj_layer(reg_feat, feat_proj_dim, + 'reg_feat_proj_layer') + + proj_feat['obj'] = self.feat_proj_layer(obj_feat, feat_proj_dim, + 'obj_feat_proj_layer') + + proj_feat['atr'] = self.feat_proj_layer(atr_feat, feat_proj_dim, + 'atr_feat_proj_layer') + + proj_feat['explt'] = self.feat_proj_layer(explt_feat, feat_proj_dim, + 'explt_feat_proj_layer') + + # Feature Combination + coeffs = self.mixing_coeffs() + print coeffs + num_regions = batch_size*ans_io_helper.num_proposals + comb_feat = tf.zeros(shape=[num_regions, feat_proj_dim], + dtype=tf.float32) + for feat_type, feat in proj_feat.items(): + comb_feat = comb_feat + feat * coeffs[feat_type] + + bn_comb_feat = batchnorm(comb_feat, None, self.is_train) + + # Softmax scores + self.rel_score = self.softmax_layer(tf.nn.relu(bn_comb_feat), + batch_size, + ans_io_helper.num_proposals) + + def create_word_vecs(self, vocab_size, word_vec_dim): + word_vecs = weight_variable([vocab_size, + word_vec_dim], + var_name='word_vecs') + word_vecs = tf.nn.l2_normalize(word_vecs, 1) + tf.add_to_collection('regularize',word_vecs) + return word_vecs + + def add_q_feat_comp_layer(self, word_vecs, plholder_dict): + with tf.name_scope('q_feat_comp_layer') as q_feat_comp_layer: + bin0_embed = q_bin_embed_graph('bin0', word_vecs, plholder_dict) + bin1_embed = q_bin_embed_graph('bin1', word_vecs, plholder_dict) + bin2_embed = q_bin_embed_graph('bin2', word_vecs, plholder_dict) + bin3_embed = q_bin_embed_graph('bin3', word_vecs, plholder_dict) + q_feat = tf.concat(1, + [bin0_embed, bin1_embed, bin2_embed, bin3_embed], + name='q_feat') + return q_feat + + def add_reg_feat_comp_layer(self, image_regions): + with tf.name_scope('reg_feat_comp_layer') as reg_feat_comp_layer: + with tf.name_scope('conv1') as conv1: + W_conv1 = weight_variable([5,5,3,4]) + b_conv1 = bias_variable([4]) + a_conv1 = tf.add(conv2d(image_regions, W_conv1), + b_conv1, name='a') + h_conv1 = tf.nn.relu(a_conv1, name='h') + h_pool1 = max_pool_2x2(h_conv1) + h_conv1_drop = tf.nn.dropout(h_pool1, self.keep_prob, + name='h_pool_drop') + + with tf.name_scope('conv2') as conv2: + W_conv2 = weight_variable([3,3,4,8]) + b_conv2 = bias_variable([8]) + a_conv2 = tf.add(conv2d(h_pool1, W_conv2), b_conv2, name='a') + h_conv2 = tf.nn.relu(a_conv2, name='h') + h_pool2 = max_pool_2x2(h_conv2) + h_pool2_drop = tf.nn.dropout(h_pool2, self.keep_prob, + name='h_pool_drop') + h_pool2_drop_shape = h_pool2_drop.get_shape() + reg_feat_dim = reduce(lambda f, g: f*g, + [dim.value for dim in + 
h_pool2_drop_shape[1:]]) + reg_feat = tf.reshape(h_pool2_drop, [-1, reg_feat_dim], + name='reg_feat') + + tf.add_to_collection('regularize', W_conv1) + tf.add_to_collection('regularize', W_conv2) + + return reg_feat + + def add_explt_feat_comp_layer(self, obj_prob, atr_prob, plholder_dict): + with tf.name_scope('explicit_feat') as expl_feat: + explt_feat_list = [] + for bin_num in xrange(4): + bin_name = 'bin'+ str(bin_num) + explt_feat_list.append(explicit_feat_graph(bin_name, obj_prob, + 'obj', plholder_dict)) + explt_feat_list.append(explicit_feat_graph(bin_name, atr_prob, + 'atr', plholder_dict)) + + concat_explt_feat = tf.concat(1, explt_feat_list, + name = 'concat_explt_feat') + + return concat_explt_feat + + def feat_proj_layer(self, feat, proj_dim, name_scope): + with tf.name_scope(name_scope) as fc_layer: + feat_dim = feat.get_shape()[1].value + W1 = weight_variable([feat_dim, proj_dim]) + b1 = bias_variable([proj_dim]) + proj_feat1 = tf.add(tf.matmul(feat, W1), b1) + bn_proj_feat1 = batchnorm(proj_feat1, None, self.is_train) + W2 = weight_variable([proj_dim, proj_dim]) + b2 = bias_variable([proj_dim]) + bn_proj_feat2 = tf.add(tf.matmul(tf.nn.relu(bn_proj_feat1), W2), b2) + + tf.add_to_collection('regularize', W1) + tf.add_to_collection('regularize', W2) + + return bn_proj_feat2 + + def mixing_coeffs(self): + feat_types = ['q', 'obj', 'atr', 'reg', 'explt'] + coeffs = dict() + count = 0; + for feat_type in feat_types: + if feat_type in self.mode: + if feat_type is 'explt': + coeffs[feat_type] = 1.0 + count += 1 + else: + coeffs[feat_type] = 1.0 + count += 1 + else: + coeffs[feat_type] = 0.0 + coeffs = {k: v/count for k, v in coeffs.items()} + return coeffs + + def softmax_layer(self, feats, batch_size, num_proposals): + feat_dim = feats.get_shape()[1].value + with tf.name_scope('softmax_layer') as softmax_layer: + W = weight_variable([feat_dim, 1]) + b = bias_variable([1]) + + vec_logits = tf.add(tf.matmul(feats, W), b, + name='vec_logits') + + logits = tf.reshape(vec_logits, + [batch_size, num_proposals]) + + y_pred = tf.nn.softmax(logits, name='softmax') + + tf.add_to_collection('regularize', W) + + return y_pred diff --git a/classifiers/rel_graph_creator2.py b/classifiers/rel_graph_creator2.py new file mode 100644 index 0000000000000000000000000000000000000000..a17f4491c46269711b89b51fff343dc52537baf9 --- /dev/null +++ b/classifiers/rel_graph_creator2.py @@ -0,0 +1,186 @@ +import numpy as np +import math +import pdb +import tensorflow as tf +import answer_classifier.ans_data_io_helper as ans_io_helper +from tf_graph_creation_helper import weight_variable, bias_variable, \ + q_bin_embed_graph, conv2d, max_pool_2x2, batchnorm, explicit_feat_graph + + +class rel_graph_creator(): + def __init__(self, + plholder_dict, + keep_prob, + obj_feat, + atr_feat, + obj_prob, + atr_prob, + vocab_size, + batch_size, + graph_config, + mode='q_obj_atr', + is_train=True): + + self.mode = mode + self.is_train = plholder_dict['is_train'] + self.keep_prob = keep_prob + image_regions = plholder_dict['image_regions'] + + with tf.name_scope('rel') as rel_graph: + # Word Vectors + word_vecs = self.create_word_vecs(vocab_size, + graph_config['word_vec_dim']) + + # Feature Computations + q_feat = self.add_q_feat_comp_layer(word_vecs, plholder_dict) + reg_feat = self.add_reg_feat_comp_layer(image_regions) + explt_feat = self.add_explt_feat_comp_layer(obj_prob, + atr_prob, + plholder_dict) + + # Feature Projections (with batch norm) + feat_proj_dim = graph_config['joint_embed_dim'] + proj_feat = dict() + + 
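+            # Unlike rel_graph_creator.py, this variant projects only the
+            # q/reg/obj/atr features to joint_embed_dim and averages them
+            # with the mixing coefficients; the explicit feature is batch
+            # normalized separately and concatenated to the averaged
+            # feature right before the softmax layer ("explt at end").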
proj_feat['q'] = self.feat_proj_layer(q_feat, feat_proj_dim, + 'q_feat_proj_layer') + + proj_feat['reg'] = self.feat_proj_layer(reg_feat, feat_proj_dim, + 'reg_feat_proj_layer') + + proj_feat['obj'] = self.feat_proj_layer(obj_feat, feat_proj_dim, + 'obj_feat_proj_layer') + + proj_feat['atr'] = self.feat_proj_layer(atr_feat, feat_proj_dim, + 'atr_feat_proj_layer') + + # Feature Combination + coeffs = self.mixing_coeffs() + print coeffs + num_regions = batch_size*ans_io_helper.num_proposals + comb_feat = tf.zeros(shape=[num_regions, feat_proj_dim], + dtype=tf.float32) + for feat_type, feat in proj_feat.items(): + comb_feat = comb_feat + feat * coeffs[feat_type] + + bn_comb_feat = batchnorm(comb_feat, None, self.is_train) + bn_explt_feat = batchnorm(explt_feat, None, self.is_train) + + # Softmax scores + final_feat = tf.concat(1, [bn_comb_feat, bn_explt_feat]) + self.rel_score = self.softmax_layer(tf.nn.relu(final_feat), + batch_size, + ans_io_helper.num_proposals) + + def create_word_vecs(self, vocab_size, word_vec_dim): + word_vecs = weight_variable([vocab_size, + word_vec_dim], + var_name='word_vecs') + word_vecs = tf.nn.l2_normalize(word_vecs, 1) + tf.add_to_collection('regularize',word_vecs) + return word_vecs + + def add_q_feat_comp_layer(self, word_vecs, plholder_dict): + with tf.name_scope('q_feat_comp_layer') as q_feat_comp_layer: + bin0_embed = q_bin_embed_graph('bin0', word_vecs, plholder_dict) + bin1_embed = q_bin_embed_graph('bin1', word_vecs, plholder_dict) + bin2_embed = q_bin_embed_graph('bin2', word_vecs, plholder_dict) + bin3_embed = q_bin_embed_graph('bin3', word_vecs, plholder_dict) + q_feat = tf.concat(1, + [bin0_embed, bin1_embed, bin2_embed, bin3_embed], + name='q_feat') + return q_feat + + def add_reg_feat_comp_layer(self, image_regions): + with tf.name_scope('reg_feat_comp_layer') as reg_feat_comp_layer: + with tf.name_scope('conv1') as conv1: + W_conv1 = weight_variable([5,5,3,4]) + b_conv1 = bias_variable([4]) + a_conv1 = tf.add(conv2d(image_regions, W_conv1), + b_conv1, name='a') + h_conv1 = tf.nn.relu(a_conv1, name='h') + h_pool1 = max_pool_2x2(h_conv1) + h_conv1_drop = tf.nn.dropout(h_pool1, self.keep_prob, + name='h_pool_drop') + + with tf.name_scope('conv2') as conv2: + W_conv2 = weight_variable([3,3,4,8]) + b_conv2 = bias_variable([8]) + a_conv2 = tf.add(conv2d(h_pool1, W_conv2), b_conv2, name='a') + h_conv2 = tf.nn.relu(a_conv2, name='h') + h_pool2 = max_pool_2x2(h_conv2) + h_pool2_drop = tf.nn.dropout(h_pool2, self.keep_prob, + name='h_pool_drop') + h_pool2_drop_shape = h_pool2_drop.get_shape() + reg_feat_dim = reduce(lambda f, g: f*g, + [dim.value for dim in + h_pool2_drop_shape[1:]]) + reg_feat = tf.reshape(h_pool2_drop, [-1, reg_feat_dim], + name='reg_feat') + + tf.add_to_collection('regularize', W_conv1) + tf.add_to_collection('regularize', W_conv2) + + return reg_feat + + def add_explt_feat_comp_layer(self, obj_prob, atr_prob, plholder_dict): + with tf.name_scope('explicit_feat') as expl_feat: + explt_feat_list = [] + for bin_num in xrange(4): + bin_name = 'bin'+ str(bin_num) + explt_feat_list.append(explicit_feat_graph(bin_name, obj_prob, + 'obj', plholder_dict)) + explt_feat_list.append(explicit_feat_graph(bin_name, atr_prob, + 'atr', plholder_dict)) + + concat_explt_feat = tf.concat(1, explt_feat_list, + name = 'concat_explt_feat') + + return concat_explt_feat + + def feat_proj_layer(self, feat, proj_dim, name_scope): + with tf.name_scope(name_scope) as fc_layer: + feat_dim = feat.get_shape()[1].value + W1 = weight_variable([feat_dim, proj_dim]) + b1 
= bias_variable([proj_dim]) + proj_feat1 = tf.add(tf.matmul(feat, W1), b1) + bn_proj_feat1 = batchnorm(proj_feat1, None, self.is_train) + W2 = weight_variable([proj_dim, proj_dim]) + b2 = bias_variable([proj_dim]) + bn_proj_feat2 = tf.add(tf.matmul(tf.nn.relu(bn_proj_feat1), W2), b2) + + tf.add_to_collection('regularize', W1) + tf.add_to_collection('regularize', W2) + + return bn_proj_feat2 + + def mixing_coeffs(self): + feat_types = ['q', 'obj', 'atr', 'reg'] + coeffs = dict() + count = 0; + for feat_type in feat_types: + if feat_type in self.mode: + coeffs[feat_type] = 1.0 + else: + coeffs[feat_type] = 0.0 + count += coeffs[feat_type] + coeffs = {k: v/count for k, v in coeffs.items()} + return coeffs + + def softmax_layer(self, feats, batch_size, num_proposals): + feat_dim = feats.get_shape()[1].value + with tf.name_scope('softmax_layer') as softmax_layer: + W = weight_variable([feat_dim, 1]) + b = bias_variable([1]) + + vec_logits = tf.add(tf.matmul(feats, W), b, + name='vec_logits') + + logits = tf.reshape(vec_logits, + [batch_size, num_proposals]) + + y_pred = tf.nn.softmax(logits, name='softmax') + + tf.add_to_collection('regularize', W) + + return y_pred diff --git a/classifiers/tf_graph_creation_helper.py b/classifiers/tf_graph_creation_helper.py index 34cebf6b2940461b9fc8ef708d7cd89ff03de10c..8419909c7346dbb95ffc309c540f2c0fe8481d2b 100644 --- a/classifiers/tf_graph_creation_helper.py +++ b/classifiers/tf_graph_creation_helper.py @@ -11,8 +11,10 @@ graph_config = { 'atr_feat_dim': 392, 'region_feat_dim': 392, #3136 'word_vec_dim': 50, + 'q_embed_dim': 200, 'ans_fc1_dim': 300, 'rel_fc1_dim': 100, + 'joint_embed_dim': 100, } def get_variable(var_scope): @@ -69,32 +71,73 @@ def placeholder_inputs(mode = 'gt'): def placeholder_inputs_rel(num_proposals, total_vocab_size, mode = 'gt'): - image_regions = tf.placeholder(tf.float32, shape=[None,25,25,3]) - keep_prob = tf.placeholder(tf.float32) - questions = tf.placeholder(tf.float32, shape=[None,total_vocab_size]) + plholder_dict = { + 'image_regions': tf.placeholder(tf.float32, [None,25,25,3], + 'image_regions'), + 'keep_prob': tf.placeholder(tf.float32, name='keep_prob'), + 'is_train': tf.placeholder(tf.bool, [], 'is_train'), + } + for i in xrange(4): + bin_name = 'bin' + str(i) + plholder_dict[bin_name + '_shape'] = \ + tf.placeholder(tf.int64, [2], bin_name + '_shape') + plholder_dict[bin_name + '_indices'] = \ + tf.placeholder(tf.int64, [None, 2], bin_name + '_indices') + plholder_dict[bin_name + '_values'] = \ + tf.placeholder(tf.int64, [None], bin_name + '_values') + plholder_dict[bin_name + '_obj_cont'] = \ + tf.placeholder(tf.float32, [None, graph_config['num_objects']], + bin_name + '_obj_cont') + plholder_dict[bin_name + '_atr_cont'] = \ + tf.placeholder(tf.float32, [None, graph_config['num_attributes']], + bin_name + '_atr_cont') if mode == 'gt': print 'Creating placeholder for ground truth' - y = tf.placeholder(tf.float32, - shape=[None, ans_io_helper.num_proposals]) - return (image_regions, questions, y, keep_prob) + plholder_dict['gt_scores'] = tf.placeholder(tf.float32,\ + shape=[None, ans_io_helper.num_proposals], name = 'gt_scores') + return plholder_dict if mode == 'no_gt': print 'No placeholder for ground truth' - return (image_regions, questions, keep_prob) + return plholder_dict def placeholder_inputs_ans(total_vocab_size, ans_vocab_size, mode='gt'): - image_regions = tf.placeholder(tf.float32, shape=[None,25,25,3]) - keep_prob = tf.placeholder(tf.float32) - questions = tf.placeholder(tf.float32, 
shape=[None,total_vocab_size]) - region_score = tf.placeholder(tf.float32, shape=[1,None]) - + plholder_dict = { + 'image_regions': tf.placeholder(tf.float32, [None,25,25,3], + 'image_regions'), + 'keep_prob': tf.placeholder(tf.float32, name='keep_prob'), + 'questions': tf.placeholder(tf.float32, [None,total_vocab_size], + 'questions'), + 'region_score': tf.placeholder(tf.float32, [1,None], + 'region_score'), + + 'is_train': tf.placeholder(tf.bool, [], 'is_train') + } + for i in xrange(4): + bin_name = 'bin' + str(i) + plholder_dict[bin_name + '_shape'] = \ + tf.placeholder(tf.int64, [2], bin_name + '_shape') + plholder_dict[bin_name + '_indices'] = \ + tf.placeholder(tf.int64, [None, 2], bin_name + '_indices') + plholder_dict[bin_name + '_values'] = \ + tf.placeholder(tf.int64, [None], bin_name + '_values') + plholder_dict[bin_name + '_obj_cont'] = \ + tf.placeholder(tf.float32, [None, graph_config['num_objects']], + bin_name + '_obj_cont') + plholder_dict[bin_name + '_atr_cont'] = \ + tf.placeholder(tf.float32, [None, graph_config['num_attributes']], + bin_name + '_atr_cont') + if mode == 'gt': print 'Creating placeholder for ground truth' - gt_answer = tf.placeholder(tf.float32, shape=[None, ans_vocab_size]) - return (image_regions, questions, keep_prob, gt_answer, region_score) + plholder_dict['gt_answer'] = tf.placeholder(tf.float32, + shape=[None, + ans_vocab_size], + name = 'gt_answer') + return plholder_dict if mode == 'no_gt': print 'No placeholder for ground truth' - return (image_regions, questions, keep_prob, region_score) + return plholder_dict def obj_comp_graph(x, keep_prob): @@ -181,16 +224,58 @@ def atr_comp_graph(x, keep_prob, obj_feat): return y_pred -def rel_comp_graph(image_regions, questions, obj_feat, atr_feat, - mode, keep_prob, vocab_size, batch_size): +def q_bin_embed_graph(bin_name, word_vecs, plholder_dict): + indices = plholder_dict[bin_name + '_indices'] + values = plholder_dict[bin_name + '_values'] + shape = plholder_dict[bin_name + '_shape'] + sp_ids = tf.SparseTensor(indices, values, shape) + return tf.nn.embedding_lookup_sparse(word_vecs, sp_ids, None, + name=bin_name + '_embedding') + + +def explicit_feat_graph(bin_name, classifier_prob, + classifier_type, plholder_dict): + cont_plholder_name = bin_name + '_' + classifier_type + '_cont' + feat_name = 'explt_' + bin_name + '_' + classifier_type + dot_product = tf.mul(classifier_prob, plholder_dict[cont_plholder_name]) + return tf.reduce_mean(dot_product, 1, keep_dims=True, name=feat_name) + + +def rel_comp_graph(plholder_dict, obj_feat, atr_feat, + obj_prob, atr_prob, mode, keep_prob, + vocab_size, batch_size): + image_regions = plholder_dict['image_regions'] with tf.name_scope('rel') as rel_graph: with tf.name_scope('word_embed') as q_embed: word_vecs = weight_variable([vocab_size, graph_config['word_vec_dim']], var_name='word_vecs') - q_feat = tf.matmul(questions, word_vecs, name='q_feat') + bin0_embed = q_bin_embed_graph('bin0', word_vecs, plholder_dict) + bin1_embed = q_bin_embed_graph('bin1', word_vecs, plholder_dict) + bin2_embed = q_bin_embed_graph('bin2', word_vecs, plholder_dict) + bin3_embed = q_bin_embed_graph('bin3', word_vecs, plholder_dict) + q_feat = tf.concat(1, [bin0_embed, + bin1_embed, + bin2_embed, + bin3_embed], name='q_feat') + + with tf.name_scope('explicit_feat') as expl_feat: + explt_feat_list = [] + for bin_num in xrange(4): + bin_name = 'bin'+ str(bin_num) + explt_feat_list.append(explicit_feat_graph(bin_name, obj_prob, + 'obj', plholder_dict)) + 
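+                # explicit_feat_graph multiplies the per-region classifier
+                # probabilities with the bin's *_obj_cont / *_atr_cont
+                # placeholder and averages over the classes, giving one
+                # scalar per region for each (bin, classifier) pair; the
+                # object score above and the attribute score below are
+                # concatenated into concat_explt_feat.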
explt_feat_list.append(explicit_feat_graph(bin_name, atr_prob, + 'atr', plholder_dict)) + + concat_explt_feat = tf.concat(1, explt_feat_list, + name = 'concat_explt_feat') + + concat_explt_feat_dim = concat_explt_feat.get_shape()[1].value + print('Concatenate explicit feature dimension: ' + \ + str(concat_explt_feat_dim)) with tf.name_scope('conv1') as conv1: W_conv1 = weight_variable([5,5,3,4]) @@ -225,47 +310,58 @@ def rel_comp_graph(image_regions, questions, obj_feat, atr_feat, print 'Atr feat dim: {}'.format(atr_feat_dim) W_reg_fc1 = weight_variable([reg_feat_dim, fc1_dim], var_name='W_reg') - W_q_fc1 = weight_variable([graph_config['word_vec_dim'], + W_q_fc1 = weight_variable([graph_config['q_embed_dim'], fc1_dim], var_name='W_q') W_obj_fc1 = weight_variable([obj_feat_dim, fc1_dim], var_name='W_obj') W_atr_fc1 = weight_variable([atr_feat_dim, fc1_dim], var_name='W_atr') + W_explt_fc1 = weight_variable([concat_explt_feat_dim, + fc1_dim], var_name='W_explt') b_fc1 = bias_variable([fc1_dim]) a_reg_fc1 = tf.matmul(reg_feat, W_reg_fc1, name='a_reg_fc1') a_q_fc1 = tf.matmul(q_feat, W_q_fc1, name='a_q_fc1') a_obj_fc1 = tf.matmul(obj_feat, W_obj_fc1, name='a_obj_fc1') a_atr_fc1 = tf.matmul(atr_feat, W_atr_fc1, name='a_atr_fc1') - + a_explt_fc1 = tf.matmul(concat_explt_feat, W_explt_fc1, + name='a_explt_fc1') + coeff = { 'reg': 0.0, 'q': 0.0, 'obj': 0.0, 'atr': 0.0, + 'explt': 0.0, } - if mode=='q_reg': + if mode=='q_reg_explt': print mode - coeff['reg'] = 1/2.0 - coeff['q'] = 1/2.0 + coeff['reg'] = 1/3.0 + coeff['q'] = 1/3.0 + coeff['explt'] = 1/3.0 - elif mode=='q_obj_atr': + elif mode=='q_obj_atr_explt': print mode - coeff['q'] = 1/3.0 - coeff['obj'] = 1/3.0 - coeff['atr'] = 1/3.0 + coeff['q'] = 0.1 + coeff['obj'] = 0.1 + coeff['atr'] = 0.1 + coeff['explt'] = 0.7 - elif mode=='q_obj_atr_reg': + elif mode=='q_obj_atr_reg_explt': print mode - coeff['q'] = 1/4.0 - coeff['obj'] = 1/4.0 - coeff['atr'] = 1/4.0 - coeff['reg'] = 1/4.0 + coeff['q'] = 0.05 + coeff['obj'] = 0.05 + coeff['atr'] = 0.05 + coeff['reg'] = 0.05 + coeff['explt'] = 0.8 + + elif mode=='explt': + coeff['explt'] = 1.0 a_fc1 = coeff['reg']*a_reg_fc1 + coeff['q']*a_q_fc1 + \ coeff['obj']*a_obj_fc1 + coeff['atr']*a_atr_fc1 + \ - b_fc1 + coeff['explt']*a_explt_fc1 + b_fc1 h_fc1 = tf.nn.relu(a_fc1, name='h') h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob, name='h_drop') @@ -283,11 +379,14 @@ def rel_comp_graph(image_regions, questions, obj_feat, atr_feat, y_pred = tf.nn.softmax(logits, name='softmax') return y_pred + - -def ans_comp_graph(image_regions, questions, keep_prob, obj_feat, atr_feat, +def ans_comp_graph(plholder_dict, obj_feat, atr_feat, vocab, inv_vocab, ans_vocab_size, mode): vocab_size = len(vocab) + image_regions = plholder_dict['image_regions'] + keep_prob = plholder_dict['keep_prob'] + with tf.name_scope('ans') as ans_graph: with tf.name_scope('word_embed') as word_embed: @@ -295,8 +394,16 @@ def ans_comp_graph(image_regions, questions, keep_prob, obj_feat, atr_feat, word_vecs = weight_variable([vocab_size, graph_config['word_vec_dim']], var_name='word_vecs') - q_feat = tf.matmul(questions, word_vecs, name='q_feat') - + + bin0_embed = q_bin_embed_graph('bin0', word_vecs, plholder_dict) + bin1_embed = q_bin_embed_graph('bin1', word_vecs, plholder_dict) + bin2_embed = q_bin_embed_graph('bin2', word_vecs, plholder_dict) + bin3_embed = q_bin_embed_graph('bin3', word_vecs, plholder_dict) + q_feat = tf.concat(1, [bin0_embed, + bin1_embed, + bin2_embed, + bin3_embed], name='q_feat') + with tf.name_scope('conv1') as conv1: 
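+            # Two conv + max-pool + dropout blocks embed each 25x25x3 region
+            # crop into a flat region feature that is fused with the question,
+            # object and attribute features in fc1 below.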
num_filters_conv1 = 4 W_conv1 = weight_variable([5,5,3,num_filters_conv1]) @@ -331,7 +438,7 @@ def ans_comp_graph(image_regions, questions, keep_prob, obj_feat, atr_feat, fc1_dim], var_name='W_obj') W_atr_fc1 = weight_variable([graph_config['atr_feat_dim'], fc1_dim], var_name='W_atr') - W_q_fc1 = weight_variable([graph_config['word_vec_dim'], + W_q_fc1 = weight_variable([graph_config['q_embed_dim'], fc1_dim], var_name='W_q') b_fc1 = bias_variable([fc1_dim]) @@ -383,6 +490,173 @@ def ans_comp_graph(image_regions, questions, keep_prob, obj_feat, atr_feat, return y_pred +def ans_comp_margin_graph(plholder_dict, obj_feat, atr_feat, obj_prob, atr_prob, + vocab, inv_vocab, ans_vocab, mode, train): + vocab_size = len(vocab) + image_regions = plholder_dict['image_regions'] + keep_prob = plholder_dict['keep_prob'] + ans_vocab_size = len(ans_vocab) + + inv_ans_vocab = {v:k for k, v in ans_vocab.items()} + ans_in_vocab_ids_list = [] + for i in xrange(ans_vocab_size): + ans_in_vocab_ids_list.append(vocab[inv_ans_vocab[i]]) + + ans_in_vocab_ids_tensor = tf.constant(ans_in_vocab_ids_list, dtype=tf.int64) + + with tf.name_scope('ans') as ans_graph: + + with tf.name_scope('word_embed') as word_embed: + + word_vecs = weight_variable([vocab_size, + graph_config['word_vec_dim']], + var_name='word_vecs') + + bin0_embed = q_bin_embed_graph('bin0', word_vecs, plholder_dict) + bin1_embed = q_bin_embed_graph('bin1', word_vecs, plholder_dict) + bin2_embed = q_bin_embed_graph('bin2', word_vecs, plholder_dict) + bin3_embed = q_bin_embed_graph('bin3', word_vecs, plholder_dict) + q_feat = tf.concat(1, [bin0_embed, + bin1_embed, + bin2_embed, + bin3_embed], name='q_feat') + + ans_embed = tf.nn.embedding_lookup(word_vecs, ans_in_vocab_ids_list, + name='ans_embed') + + with tf.name_scope('explicit_feat') as expl_feat: + explt_feat_list = [] + for bin_num in xrange(4): + bin_name = 'bin'+ str(bin_num) + explt_feat_list.append(explicit_feat_graph(bin_name, obj_prob, + 'obj', plholder_dict)) + explt_feat_list.append(explicit_feat_graph(bin_name, atr_prob, + 'atr', plholder_dict)) + + concat_explt_feat = tf.concat(1, explt_feat_list, + name = 'concat_explt_feat') + + concat_explt_feat_dim = concat_explt_feat.get_shape()[1].value + print('Concatenate explicit feature dimension: ' + \ + str(concat_explt_feat_dim)) + + with tf.name_scope('conv1') as conv1: + num_filters_conv1 = 4 + W_conv1 = weight_variable([5,5,3,num_filters_conv1]) + b_conv1 = bias_variable([num_filters_conv1]) + a_conv1 = tf.add(conv2d(image_regions, W_conv1), b_conv1, name='a') + h_conv1 = tf.nn.relu(a_conv1, name='h') + h_pool1 = max_pool_2x2(h_conv1) + h_conv1_drop = tf.nn.dropout(h_pool1, keep_prob, name='h_pool_drop') + + with tf.name_scope('conv2') as conv2: + num_filters_conv2 = 8 + W_conv2 = weight_variable([3,3,num_filters_conv1,num_filters_conv2]) + b_conv2 = bias_variable([num_filters_conv2]) + a_conv2 = tf.add(conv2d(h_pool1, W_conv2), b_conv2, name='a') + h_conv2 = tf.nn.relu(a_conv2, name='h') + h_pool2 = max_pool_2x2(h_conv2) + h_pool2_drop = tf.nn.dropout(h_pool2, keep_prob, name='h_pool_drop') + h_pool2_drop_shape = h_pool2_drop.get_shape() + region_feat_dim = reduce(lambda f, g: f*g, + [dim.value for dim in h_pool2_drop_shape[1:]]) + region_feat = tf.reshape(h_pool2_drop, [-1, region_feat_dim], + name='region_feat') + + print('Region feature dimension: ' + str(region_feat_dim)) #392 + + with tf.name_scope('fc1') as fc1: + + fc1_dim = graph_config['ans_fc1_dim'] + W_region_fc1 = weight_variable([region_feat_dim, + fc1_dim], 
var_name='W_region') + W_obj_fc1 = weight_variable([graph_config['obj_feat_dim'], + fc1_dim], var_name='W_obj') + W_atr_fc1 = weight_variable([graph_config['atr_feat_dim'], + fc1_dim], var_name='W_atr') + W_q_fc1 = weight_variable([graph_config['q_embed_dim'], + fc1_dim], var_name='W_q') + W_explt_fc1 = weight_variable([concat_explt_feat_dim, + fc1_dim], var_name='W_explt') + b_fc1 = bias_variable([fc1_dim]) + + a_fc1_region = tf.matmul(region_feat, W_region_fc1, + name='a_fc1_region') + a_fc1_obj = tf.matmul(obj_feat, W_obj_fc1, name='a_fc1_obj') + a_fc1_atr = tf.matmul(atr_feat, W_atr_fc1, name='a_fc1_atr') + a_fc1_q = tf.matmul(q_feat, W_q_fc1, name='a_fc1_q') + a_explt_fc1 = tf.matmul(concat_explt_feat, W_explt_fc1, + name='a_explt_fc1') + + a_fc1_region = batchnorm(a_fc1_region, 'reg', train) + a_fc1_obj = batchnorm(a_fc1_obj, 'obj', train) + a_fc1_atr = batchnorm(a_fc1_atr, 'atr', train) + a_fc1_q = batchnorm(a_fc1_q, 'q', train) + a_explt_fc1 = batchnorm(a_explt_fc1, 'explt', train) + + coeff_reg = 0.0 + coeff_obj = 0.0 + coeff_atr = 0.0 + coeff_q = 0.0 + coeff_explt = 0.0 + + if mode=='q': + coeff_q = 1.0 + + elif mode=='q_reg': + coeff_q = 1/2.0 + coeff_reg = 1/2.0 + + elif mode=='q_obj_atr': + coeff_q = 1/4.0 + coeff_obj = 1/4.0 + coeff_atr = 1/4.0 + coeff_explt = 1/4.0 + + elif mode=='q_obj_atr_reg': + coeff_q = 1/5.0 + coeff_obj = 1/5.0 + coeff_atr = 1/5.0 + coeff_reg = 1/5.0 + coeff_explt = 1/5.0 + + a_fc1 = coeff_reg * a_fc1_region + \ + coeff_obj * a_fc1_obj + \ + coeff_atr * a_fc1_atr + \ + coeff_q * a_fc1_q + \ + coeff_explt * a_explt_fc1 + + h_fc1 = tf.nn.relu(a_fc1, name='h') + h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob, name='h_drop') + + with tf.name_scope('fc2') as fc2: + W_feat_fc2 = weight_variable([fc1_dim, + graph_config['word_vec_dim']], + var_name='W_feat') + b_feat_fc2 = bias_variable([graph_config['word_vec_dim']], + var_name='b_feat') + W_ans_fc2 = weight_variable([graph_config['word_vec_dim'], + graph_config['word_vec_dim']], + var_name='W_ans') + b_ans_fc2 = bias_variable([graph_config['word_vec_dim']], + var_name='b_ans') + comb_feat_embed = tf.add(tf.matmul(h_fc1_drop, W_feat_fc2), + b_feat_fc2, + name='comb_feat_embed') + comb_ans_embed = tf.add(tf.matmul(ans_embed, W_ans_fc2), + b_ans_fc2, + name='comb_feat_embed') + comb_feat_embed = batchnorm(comb_feat_embed, 'feat_embed', train) + comb_ans_embed = batchnorm(comb_ans_embed, 'ans_embed', train) + comb_feat_embed = tf.nn.l2_normalize(comb_feat_embed, 1) + comb_ans_embed = tf.nn.l2_normalize(comb_ans_embed,1) + ans_scores = tf.matmul(comb_feat_embed, tf.transpose(comb_ans_embed), + name='ans_scores') + #ans_scores = tf.nn.l2_normalize(ans_scores, 1)*3.0 + return tf.nn.softmax(ans_scores) + + + def aggregate_y_pred(y_pred, region_score, batch_size, num_proposals, ans_vocab_size): y_pred_list = tf.split(0, batch_size, y_pred) @@ -410,6 +684,12 @@ def loss(y, y_pred): return tf.truediv(cross_entropy, tf.cast(batch_size[0],tf.float32)) +def margin_loss(y, y_pred, margin): + correct_score = tf.reduce_sum(tf.mul(y, y_pred), 1, + keep_dims=True, name='correct_score') + return tf.reduce_mean(tf.maximum(0.0, y_pred + margin - correct_score)) + + def regularize_params(param_list): regularizer = tf.zeros(shape=[]) for param in param_list: @@ -417,6 +697,40 @@ def regularize_params(param_list): return regularizer +def batchnorm(input, suffix, is_train, decay=0.95, epsilon=1e-4, name='bn'): + rank = len(input.get_shape().as_list()) + in_dim = input.get_shape().as_list()[-1] + + if rank == 2: + axes = [0] + elif rank 
== 4: + axes = [0, 1, 2] + else: + raise ValueError('Input tensor must have rank 2 or 4.') + + if suffix: + suffix = '_' + suffix + else: + suffix = '' + + mean, variance = tf.nn.moments(input, axes) + offset = tf.Variable(initial_value=tf.constant(value=0.0, shape=[in_dim]), + name='offset' + suffix) + scale = tf.Variable(initial_value=tf.constant(value=1.0, shape=[in_dim]), + name='scale' + suffix) + + ema = tf.train.ExponentialMovingAverage(decay=decay) + ema_apply_op = ema.apply([mean, variance]) + ema_mean, ema_var = ema.average(mean), ema.average(variance) + + with tf.control_dependencies([ema_apply_op]): + bn_train = tf.nn.batch_normalization(input, mean, variance, + offset, scale, epsilon, name) + bn_test = tf.nn.batch_normalization(input, ema_mean, ema_var, + offset, scale, epsilon, name) + return tf.cond(is_train, lambda : bn_train, lambda : bn_test) + + if __name__ == '__main__': lg_dir = '/home/tanmay/Code/GenVQA/Exp_Results/lg_files/' diff --git a/classifiers/train_classifiers.py b/classifiers/train_classifiers.py index 0e9089ee3b504003d40e7314e2b65e032a24a765..59b160222106bf996c82ef429156183dfcefe1e9 100644 --- a/classifiers/train_classifiers.py +++ b/classifiers/train_classifiers.py @@ -1,3 +1,4 @@ + import sys import json import os @@ -9,9 +10,12 @@ import object_classifiers.train_obj_classifier as obj_trainer import object_classifiers.eval_obj_classifier as obj_evaluator import attribute_classifiers.train_atr_classifier as atr_trainer import attribute_classifiers.eval_atr_classifier as atr_evaluator -import answer_classifier.train_ans_classifier as ans_trainer -import region_ranker.train_rel_classifier as rel_trainer -import region_ranker.eval_rel_classifier as rel_evaluator +#import answer_classifier.train_ans_classifier as ans_trainer +import answer_classifier.train_ans_classifier_simple as ans_trainer +#import region_ranker.train_rel_classifier as rel_trainer +import region_ranker.train_rel_classifier_simple as rel_trainer +#import region_ranker.eval_rel_classifier as rel_evaluator +import region_ranker.eval_rel_classifier_simple as rel_evaluator workflow = { 'train_obj': False, @@ -19,8 +23,8 @@ workflow = { 'train_atr': False, 'eval_atr': False, 'train_rel': False, - 'eval_rel': False, - 'train_ans': True, + 'eval_rel': True, + 'train_ans': False, } obj_classifier_train_params = { @@ -65,29 +69,31 @@ rel_classifier_train_params = { 'train_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/train_anno.json', 'test_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/test_anno.json', 'regions_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/regions_anno.json', + 'parsed_q_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/parsed_questions.json', 'image_dir': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/images', 'image_regions_dir': '/mnt/ramdisk/image_regions', - 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Prob', + 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Explt_At_End', 'obj_atr_model': '/home/tanmay/Code/GenVQA/Exp_Results/Atr_Classifier/obj_atr_classifier-1', - 'mode': 'q_obj_atr', + 'mode': 'q_obj_atr_reg_explt', 'adam_lr' : 0.001, 'crop_n_save_regions': False, 'max_epoch': 5, 'batch_size': 10, 'fine_tune': False, - 'start_model': 4, # Used only if fine_tune is True + 'start_model': 0, # Used only if fine_tune is True } rel_classifier_eval_params = { 'train_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/train_anno.json', 'test_json': 
'/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/test_anno.json', 'regions_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/regions_anno.json', + 'parsed_q_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/parsed_questions.json', 'image_dir': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/images', 'image_regions_dir': '/mnt/ramdisk/image_regions', - 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Prob', - 'model_basedir': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Prob', + 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Explt_At_End', + 'model_basedir': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Explt_At_End', 'model_number': 4, - 'mode': 'q_obj_atr', + 'mode': 'q_obj_atr_reg_explt', 'batch_size': 20, 'test_start_id': 94645, 'test_set_size': 143495-94645+1, @@ -97,18 +103,19 @@ ans_classifier_train_params = { 'train_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/train_anno.json', 'test_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/test_anno.json', 'regions_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/regions_anno.json', + 'parsed_q_json': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/parsed_questions.json', 'image_dir': '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/images', 'image_regions_dir': '/mnt/ramdisk/image_regions', - 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_w_Rel', - 'rel_model': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier/rel_classifier_q_obj_atr-4', + 'outdir': '/home/tanmay/Code/GenVQA/Exp_Results/Ans_Classifier_Margin', + 'rel_model': '/home/tanmay/Code/GenVQA/Exp_Results/Rel_Classifier_Obj_Atr_Explt_At_End/rel_classifier_q_obj_atr_reg_explt-4', 'obj_atr_model': '/home/tanmay/Code/GenVQA/Exp_Results/Atr_Classifier/obj_atr_classifier-1', - 'adam_lr' : 0.0001, - 'mode' : 'q_obj_atr', + 'adam_lr' : 0.001, + 'mode' : 'q_obj_atr_reg', 'crop_n_save_regions': False, - 'max_epoch': 10, + 'max_epoch': 5, 'batch_size': 10, - 'fine_tune': True, - 'start_model': 4, + 'fine_tune': False, + 'start_model': 0, # When fine_tune is false used to pre-initialize q_obj_atr with q model etc } if __name__=='__main__':