diff --git a/classifiers/answer_classifier/ans_data_io_helper.py b/classifiers/answer_classifier/ans_data_io_helper.py
index 71b37600a5c256b8b357b51fe2908eac575f0891..58988558996a7751a51c370b4ba0cc3cc1c21592 100644
--- a/classifiers/answer_classifier/ans_data_io_helper.py
+++ b/classifiers/answer_classifier/ans_data_io_helper.py
@@ -12,6 +12,9 @@
 import region_ranker.perfect_ranker as region_proposer
 
 qa_tuple = namedtuple('qa_tuple','image_id question answer')
 
+num_proposals = 22
+region_coords = region_proposer.get_region_coords()
+
 def create_ans_dict():
     ans_dict = {
@@ -67,39 +70,57 @@
 def ans_mini_batch_loader(qa_dict, region_anno_dict, ans_dict, vocab,
                           image_dir, mean_image, start_index, batch_size,
                           img_height=100, img_width=100, channels = 3):
-    # compute the number of proposals
-    count = 0;
+    ans_labels = np.zeros(shape=[batch_size, len(ans_dict)])
     for i in xrange(start_index, start_index + batch_size):
-        count = count + len(region_anno_dict[qa_dict[i].image_id])
+        answer = qa_dict[i].answer
+        ans_labels[i-start_index, ans_dict[answer]] = 1
+
+    # number of regions in the batch
+    count = batch_size*num_proposals;
 
-    region_images = np.empty(shape=[count, img_height,
+    region_images = np.zeros(shape=[count, img_height,
                                     img_width, channels])
-
-    ans_labels = np.zeros(shape=[count, len(ans_dict)])
+    region_score = np.zeros(shape=[1,count])
+    partition = np.zeros(shape=[count])
     question_encodings = np.zeros(shape=[count, len(vocab)])
-
-    counter = 0
+
     for i in xrange(start_index, start_index + batch_size):
         image_id = qa_dict[i].image_id
         question = qa_dict[i].question
         answer = qa_dict[i].answer
-        region_coords = region_anno_dict[image_id]
-        image = mpimg.imread(os.path.join(image_dir, str(image_id) + '.jpg'))
-        regions = region_proposer.rank_regions(image, question, region_coords)
-        for _, proposal in regions.items():
-            resized_region = misc.imresize(proposal.image, \
+        gt_regions_for_image = region_anno_dict[image_id]
+        image = mpimg.imread(os.path.join(image_dir,
+                                          str(image_id) + '.jpg'))
+        regions = region_proposer.rank_regions(image, question, region_coords,
+                                               gt_regions_for_image)
+        for j in xrange(num_proposals):
+            counter = j + (i-start_index)*num_proposals
+            resized_region = misc.imresize(regions[j].image, \
                                            (img_height, img_width))
-            region_images[counter,:,:,:] = (resized_region / 254.0) - mean_image
-            ans_labels[counter, ans_dict[answer]] = 1
+            region_images[counter,:,:,:] = (resized_region / 254.0) \
+                                           - mean_image
+            region_score[0,counter] = regions[j].score
+            partition[counter] = i-start_index
 
             for word in question[0:-1].split():
                 if word not in vocab:
                     word = 'unk'
                 question_encodings[counter, vocab[word]] += 1
 
-            counter = counter + 1
+    # Check for nans, infs
+    assert (not np.any(np.isnan(region_images))), "NaN in region_images"
+    assert (not np.any(np.isnan(ans_labels))), "NaN in labels"
+    assert (not np.any(np.isnan(question_encodings))), "NaN in question_encodings"
+    assert (not np.any(np.isnan(region_score))), "NaN in region_score"
+    assert (not np.any(np.isnan(partition))), "NaN in partition"
+
+    assert (not np.any(np.isinf(region_images))), "Inf in region_images"
+    assert (not np.any(np.isinf(ans_labels))), "Inf in labels"
+    assert (not np.any(np.isinf(question_encodings))), "Inf in question_encodings"
+    assert (not np.any(np.isinf(region_score))), "Inf in region_score"
+    assert (not np.any(np.isinf(partition))), "Inf in partition"
 
-    return region_images, ans_labels, question_encodings
+    return region_images, ans_labels, question_encodings, region_score, partition
 
 if __name__=='__main__':
@@ -112,6 +133,8 @@ if __name__=='__main__':
 
     image_dir = '/home/tanmay/Code/GenVQA/GenVQA/' + \
                 'shapes_dataset/images'
+    mean_image = np.load('/home/tanmay/Code/GenVQA/Exp_Results/' + \
+                         'Obj_Classifier/mean_image.npy')
 
     qa_anno_dict = parse_qa_anno(train_anno_filename)
     region_anno_dict = region_proposer.parse_region_anno(region_anno_filename)
@@ -119,11 +142,13 @@ if __name__=='__main__':
     vocab, _ = get_vocab(qa_anno_dict)
 
-    region_images, ans_labels, question_encodings = \
+    region_images, ans_labels, question_encodings, score, partition = \
         ans_mini_batch_loader(qa_anno_dict, region_anno_dict, ans_dict, vocab,
-                              image_dir, None, 1, 2, 25, 25, 3)
+                              image_dir, mean_image, 1, 1, 25, 25, 3)
 
     print(ans_labels.shape)
     print(question_encodings.shape)
     print(region_images.shape)
+    print(score)
+    print(partition)
diff --git a/classifiers/answer_classifier/ans_data_io_helper.pyc b/classifiers/answer_classifier/ans_data_io_helper.pyc
index d03abbe8c016b6f922bcc5fb29c71faa48cdc2b8..6eda2294ef5263df1f1e80c298cf9d93111a0b99 100644
Binary files a/classifiers/answer_classifier/ans_data_io_helper.pyc and b/classifiers/answer_classifier/ans_data_io_helper.pyc differ
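Reviewer note on the new batch layout: ans_mini_batch_loader now always emits num_proposals rows per question, addressed by counter = j + (i - start_index)*num_proposals, with partition recording which question each region row belongs to. A minimal NumPy sketch of that indexing (toy batch size, not code from this patch):

import numpy as np

batch_size = 3      # questions in the batch (toy value)
num_proposals = 22  # fixed proposals per image, as in ans_data_io_helper

partition = np.zeros(batch_size * num_proposals)
for i in range(batch_size):              # i stands in for (i - start_index)
    for j in range(num_proposals):
        counter = j + i * num_proposals  # row for proposal j of question i
        partition[counter] = i

# Rows [i*num_proposals, (i+1)*num_proposals) all belong to question i.
assert partition[21] == 0 and partition[22] == 1 and partition[65] == 2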
diff --git a/classifiers/answer_classifier/train_ans_classifier.py b/classifiers/answer_classifier/train_ans_classifier.py
index 10d7f9a3e7592207a3bf971b547fbd80cc70b9b8..6d27300500b1ab60033e7fd188fd3141331e217b 100644
--- a/classifiers/answer_classifier/train_ans_classifier.py
+++ b/classifiers/answer_classifier/train_ans_classifier.py
@@ -12,6 +12,39 @@
 import plot_helper as plotter
 import ans_data_io_helper as ans_io_helper
 import region_ranker.perfect_ranker as region_proposer
 
+val_start_id = 106115
+val_batch_size = 1000
+
+batch_size = 10
+
+def evaluate(accuracy, qa_anno_dict, region_anno_dict, ans_vocab, vocab,
+             image_dir, mean_image, start_index, batch_size,
+             placeholders, img_height=100, img_width=100):
+
+    correct = 0
+    for i in xrange(start_index, start_index + batch_size):
+        region_images, ans_labels, questions, \
+        region_score, partition= \
+            ans_io_helper.ans_mini_batch_loader(qa_anno_dict,
+                                                region_anno_dict,
+                                                ans_vocab, vocab,
+                                                image_dir, mean_image,
+                                                i, 1,
+                                                img_height, img_width, 3)
+
+        feed_dict = {
+            placeholders[0] : region_images,
+            placeholders[1] : questions,
+            placeholders[2] : 1.0,
+            placeholders[3] : ans_labels,
+            placeholders[4] : region_score,
+        }
+
+        correct = correct + accuracy.eval(feed_dict)
+
+    return correct/batch_size
+
+
 def train(train_params):
     sess = tf.InteractiveSession()
@@ -34,7 +67,7 @@ def train(train_params):
     vocab, inv_vocab = ans_io_helper.get_vocab(qa_anno_dict)
 
     # Create graph
-    image_regions, questions, keep_prob, y = \
+    image_regions, questions, keep_prob, y, region_score= \
         graph_creator.placeholder_inputs_ans(len(vocab), len(ans_vocab),
                                              mode='gt')
     y_pred_obj = graph_creator.obj_comp_graph(image_regions, keep_prob)
@@ -49,13 +82,19 @@ def train(train_params):
     obj_atr_saver.restore(sess, model_to_restore)
 
     y_pred = graph_creator.ans_comp_graph(image_regions, questions, keep_prob, \
-                                          obj_feat[0], atr_feat[0],
-                                          vocab, inv_vocab, len(ans_vocab))
-    cross_entropy = graph_creator.loss(y, y_pred)
-    accuracy = graph_creator.evaluation(y, y_pred)
+                                          obj_feat[0], atr_feat[0],
+                                          vocab, inv_vocab, len(ans_vocab))
+    y_avg = graph_creator.aggregate_y_pred(y_pred, region_score, batch_size,
+                                           ans_io_helper.num_proposals,
+                                           len(ans_vocab))
+# y_avg = tf.matmul(region_score,y_pred)
+
+    cross_entropy = graph_creator.loss(y, y_avg)
+    accuracy = graph_creator.evaluation(y, y_avg)
 
     # Collect variables
     vars_to_opt = tf.get_collection(tf.GraphKeys.VARIABLES, scope='ans')
+
     train_step = tf.train.AdamOptimizer(train_params['adam_lr']) \
                          .minimize(cross_entropy, var_list=vars_to_opt)
@@ -78,56 +117,72 @@ def train(train_params):
     mean_image = np.load('/home/tanmay/Code/GenVQA/Exp_Results/' + \
                          'Obj_Classifier/mean_image.npy')
 
-    # Val data
-    val_region_images, val_ans_labels, val_questions = \
-        ans_io_helper.ans_mini_batch_loader(qa_anno_dict, region_anno_dict,
-                                            ans_vocab, vocab, image_dir,
-                                            mean_image, 9501, 499,
-                                            25, 25, 3)
-    feed_dict_val = {
-        image_regions : val_region_images,
-        questions: val_questions,
-        keep_prob: 1.0,
-        y: val_ans_labels,
-    }
-
+    placeholders = [image_regions, questions, keep_prob, y, region_score]
 
     # Start Training
-    batch_size = 10
+# batch_size = 1
     max_epoch = 10
-    max_iter = 950
+    max_iter = 9500
     val_acc_array_epoch = np.zeros([max_epoch])
     train_acc_array_epoch = np.zeros([max_epoch])
     for epoch in range(max_epoch):
        for i in range(max_iter):
            if i%100==0:
                print('Iter: ' + str(i))
-               print('Val Acc: ' + str(accuracy.eval(feed_dict_val)))
-           train_region_images, train_ans_labels, train_questions = \
+               # val_accuracy = evaluate(accuracy, qa_anno_dict,
+               #                         region_anno_dict, ans_vocab, vocab,
+               #                         image_dir, mean_image,
+               #                         val_start_id, val_batch_size,
+               #                         placeholders, 25, 25)
+               # print(val_accuracy)
+
+           train_region_images, train_ans_labels, train_questions, \
+           train_region_score, train_partition= \
                ans_io_helper.ans_mini_batch_loader(qa_anno_dict,
                                                    region_anno_dict,
                                                    ans_vocab, vocab, image_dir,
                                                    mean_image, 1+i*batch_size,
                                                    batch_size, 25, 25, 3)
+
+
            feed_dict_train = {
                image_regions : train_region_images,
                questions: train_questions,
-               keep_prob: 1.0,
+               keep_prob: 0.5,
                y: train_ans_labels,
+               region_score: train_region_score,
            }
-           _, current_train_batch_acc = sess.run([train_step, accuracy],
-                                                 feed_dict=feed_dict_train)
+
+           tf.shape(y_pred)
+
+           q_feat = tf.get_collection('q_feat', scope='ans/q_embed')
+           _,current_train_batch_acc,q_feat_eval = \
+               sess.run([train_step, accuracy, q_feat[0]],
+                        feed_dict=feed_dict_train)
+
+# print(q_feat_eval)
+           # print(q_feat_eval.shape)
+# print(i)
+# print(train_questions)
+# print(train_ans_labels)
+# print(train_region_score)
+
            train_acc_array_epoch[epoch] = train_acc_array_epoch[epoch] + \
                                           current_train_batch_acc
 
        train_acc_array_epoch[epoch] = train_acc_array_epoch[epoch] / max_iter
-       val_acc_array_epoch[epoch] = accuracy.eval(feed_dict_val)
-       plotter.plot_accuracies(xdata=np.arange(0, epoch + 1) + 1,
-                               ydata_train=train_acc_array_epoch[0:epoch + 1],
-                               ydata_val=val_acc_array_epoch[0:epoch + 1],
-                               xlim=[1, max_epoch], ylim=[0, 1.0],
-                               savePath=os.path.join(outdir,
-                                                     'acc_vs_epoch.pdf'))
+       # val_accuracy = evaluate(accuracy, qa_anno_dict,
+       #                         region_anno_dict, ans_vocab, vocab,
+       #                         image_dir, mean_image, 9501, 499,
+       #                         placeholders, 25, 25)
+       # val_acc_array_epoch[epoch] = val_accuracy
+       # print(val_accuracy)
+       # plotter.plot_accuracies(xdata=np.arange(0, epoch + 1) + 1,
+       #                         ydata_train=train_acc_array_epoch[0:epoch + 1],
+       #                         ydata_val=val_acc_array_epoch[0:epoch + 1],
+       #                         xlim=[1, max_epoch], ylim=[0, 1.0],
+       #                         savePath=os.path.join(outdir,
+       #                                               'acc_vs_epoch.pdf'))
 
        save_path = saver.save(sess,
                               os.path.join(outdir, 'ans_classifier'),
                               global_step=epoch)
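The substantive change here is that the loss and accuracy are now computed on y_avg, the score-weighted combination of per-region predictions produced by aggregate_y_pred (defined in tf_graph_creation_helper.py further down in this diff). A minimal NumPy sketch of that per-question reduction, with made-up toy shapes and values:

import numpy as np

num_proposals, ans_vocab_size = 4, 3                    # toy sizes
y_pred = np.random.rand(num_proposals, ans_vocab_size)  # per-region answer scores
region_score = np.array([[0.7, 0.1, 0.1, 0.1]])         # shape [1, num_proposals]

y_avg = region_score.dot(y_pred)  # one question's aggregated prediction
assert y_avg.shape == (1, ans_vocab_size)

The commented-out alternative, y_avg = tf.matmul(region_score, y_pred), would collapse all batch_size*num_proposals rows into a single prediction for the whole batch, which is presumably why the per-example split inside aggregate_y_pred is used instead.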
diff --git a/classifiers/region_ranker/perfect_ranker.py b/classifiers/region_ranker/perfect_ranker.py
index 737dfad7b74c7a382538704642ad270b2eb098f1..a26d8e159d3a3c0611a5d32937e39cc39a943acf 100644
--- a/classifiers/region_ranker/perfect_ranker.py
+++ b/classifiers/region_ranker/perfect_ranker.py
@@ -7,6 +7,7 @@
 import matplotlib.image as mpimg
 from scipy import misc
 region = namedtuple('region','image score coord')
+
 def parse_region_anno(json_filename):
     with open(json_filename,'r') as json_file:
         raw_data = json.load(json_file)
@@ -16,24 +17,66 @@ def parse_region_anno(json_filename):
         region_anno_dict[entry['image_id']] = entry['regions']
 
     return region_anno_dict
-
-
-def rank_regions(image, question, region_coords):
-    regions = dict()
-    count = 1;
-    for key in region_coords:
-        x1, y1, x2, y2 = region_coords[key]
-        cropped_image = image[y1-1:y2, x1-1:x2, :]
-        if key in question:
-            score = 1
-        else:
-            score = 0
+def get_region_coords():
+    region_coords = np.array([[   1,   1, 100, 100],
+                              [ 101,   1, 200, 100],
+                              [ 201,   1, 300, 100],
+                              [   1, 101, 100, 200],
+                              [ 101, 101, 200, 200],
+                              [ 201, 101, 300, 200],
+                              [   1, 201, 100, 300],
+                              [ 101, 201, 200, 300],
+                              [ 201, 201, 300, 300],
+                              [   1,   1, 100, 200],
+                              [ 101,   1, 200, 200],
+                              [ 201,   1, 300, 200],
+                              [   1, 101, 100, 300],
+                              [ 101, 101, 200, 300],
+                              [ 201, 101, 300, 300],
+                              [   1,   1, 200, 100],
+                              [ 101,   1, 300, 100],
+                              [   1, 101, 200, 200],
+                              [ 101, 101, 300, 200],
+                              [   1, 201, 200, 300],
+                              [ 101, 201, 300, 300],
+                              [   1,   1, 300, 300]])
+    return region_coords
+
+def rank_regions(image, question, region_coords, gt_regions_for_image):
+
+    num_regions, _ = region_coords.shape
+    regions = dict()
+
+    count = 0;
+    no_region_flag = True
+    for i in xrange(num_regions):
+        x1 = region_coords[i,0]
+        y1 = region_coords[i,1]
+        x2 = region_coords[i,2]
+        y2 = region_coords[i,3]
+
+        cropped_image = image[y1-1:y2, x1-1:x2, :]
+        score = 0
+
+        for gt_region in gt_regions_for_image:
+            x1_, y1_, x2_, y2_ = gt_regions_for_image[gt_region]
+            if x1==x1_ and x2==x2_ and y1==y1_ and y2==y2_:
+                score = 1
+                no_region_flag = False
+                break
+
         regions[count] = region(image=cropped_image, score=score,
-                                coord=region_coords[key])
+                                coord=region_coords[i,:])
         count = count + 1
 
+    if no_region_flag==True:
+        for i in xrange(num_regions):
+            regions[i] = region(image=regions[i].image, score=1.0/num_regions,
+                                coord=regions[i].coord)
+
     return regions
@@ -41,13 +84,25 @@ if __name__=='__main__':
 
     image_dir = '/home/tanmay/Code/GenVQA/GenVQA/shapes_dataset/images/'
     json_filename = os.path.join('/home/tanmay/Code/GenVQA/GenVQA/',
-                                 'shapes_dataset/regions_anno.json')
-    region_anno_dict = parse_region_anno(json_filename)
+                                 'shapes_dataset/train_anno.json')
+    # region_anno_dict = parse_region_anno(json_filename)
 
-    image_id = 1
-    question = 'Is there a blue triangle?'
-    region_coords = region_anno_dict[image_id]
-    image = mpimg.imread(os.path.join(image_dir, str(image_id) + '.jpg'))
-    regions = rank_regions(image, question, region_coords)
-    print(regions)
+    # image_id = 1
+    # question = 'Is there a blue triangle?'
+    # region_coords = region_anno_dict[image_id]
+    # image = mpimg.imread(os.path.join(image_dir, str(image_id) + '.jpg'))
+    # regions = rank_regions(image, question, region_coords)
+    # print(regions)
+# count = 0
+    # for i in xrange(14999):
+    #     count = count + len(region_anno_dict[i+1])
+    # print(count)
+    # print(len(region_anno_dict[1]))
+    with open(json_filename,'r') as json_file:
+        raw_data = json.load(json_file)
+
+
+    for key in raw_data:
+        if key['image_id']==9999:
+            print(key['question_id'])
diff --git a/classifiers/region_ranker/perfect_ranker.pyc b/classifiers/region_ranker/perfect_ranker.pyc
index 04058d905a9d6519d32eafa0d15668d4dd4f9191..ac7c5aca66999387ddddd8c1307cd88b3c37391b 100644
Binary files a/classifiers/region_ranker/perfect_ranker.pyc and b/classifiers/region_ranker/perfect_ranker.pyc differ
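The 22 hard-coded boxes in get_region_coords() look like every 1x1, 2x1 and 1x2 block of a 3x3 grid over the 300x300 image, plus the full image (9 + 6 + 6 + 1 = 22). A sketch that regenerates the list under that assumption, usable as a cross-check against the literal array:

import numpy as np

edges = [1, 101, 201, 301]  # cell boundaries of the assumed 3x3 grid

coords = []
for r in range(3):
    for c in range(3):      # 9 single cells
        coords.append([edges[c], edges[r], edges[c+1]-1, edges[r+1]-1])
for r in range(2):
    for c in range(3):      # 6 two-cell-tall strips
        coords.append([edges[c], edges[r], edges[c+1]-1, edges[r+2]-1])
for r in range(3):
    for c in range(2):      # 6 two-cell-wide strips
        coords.append([edges[c], edges[r], edges[c+2]-1, edges[r+1]-1])
coords.append([1, 1, 300, 300])  # whole image

print(np.array(coords).shape)  # (22, 4), matching num_proposals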
diff --git a/classifiers/tf_graph_creation_helper.py b/classifiers/tf_graph_creation_helper.py
index 71240aaae6269ca63eae75955f3d0e5a5d2b744b..1b18cc04f8761e8f7e20b0ffed0c044627280c98 100644
--- a/classifiers/tf_graph_creation_helper.py
+++ b/classifiers/tf_graph_creation_helper.py
@@ -35,13 +35,15 @@ def placeholder_inputs_ans(total_vocab_size, ans_vocab_size, mode='gt'):
     image_regions = tf.placeholder(tf.float32, shape=[None,25,25,3])
     keep_prob = tf.placeholder(tf.float32)
     questions = tf.placeholder(tf.float32, shape=[None,total_vocab_size])
+    region_score = tf.placeholder(tf.float32, shape=[1,None])
+
     if mode == 'gt':
         print 'Creating placeholder for ground truth'
         gt_answer = tf.placeholder(tf.float32, shape=[None, ans_vocab_size])
-        return (image_regions, questions, keep_prob, gt_answer)
+        return (image_regions, questions, keep_prob, gt_answer, region_score)
     if mode == 'no_gt':
         print 'No placeholder for ground truth'
-        return (image_regions, questions, keep_prob)
+        return (image_regions, questions, keep_prob, region_score)
 
 
 def obj_comp_graph(x, keep_prob):
@@ -93,15 +95,14 @@ def atr_comp_graph(x, keep_prob, obj_feat):
 def ans_comp_graph(image_regions, questions, keep_prob, \
                    obj_feat, atr_feat, vocab, inv_vocab, ans_vocab_size):
     with tf.name_scope('ans') as ans_graph:
-        with tf.name_scope('word_embed') as word_embed:
-            initial = tf.random_uniform(shape=[len(vocab),100], minval=0, maxval=1)
+        initial = tf.truncated_normal(shape=[len(vocab),100], stddev=0.1)
+# initial = tf.random_uniform(shape=[len(vocab),100], minval=0, maxval=1)
         word_vecs = tf.Variable(initial, name='word_vecs')
 
         with tf.name_scope('q_embed') as q_embed:
             q_feat = tf.matmul(questions, word_vecs)
-            num_words = tf.reduce_sum(questions, 1, keep_dims=True)
-            q_feat = tf.truediv(q_feat, num_words)
+# q_feat = tf.truediv(q_feat, tf.cast(len(vocab),tf.float32))
 
         with tf.name_scope('conv1') as conv1:
             W_conv1 = weight_variable([5,5,3,4])
@@ -135,6 +136,15 @@ def ans_comp_graph(image_regions, questions, keep_prob, \
 
     return y_pred
 
+def aggregate_y_pred(y_pred, region_score, batch_size, num_proposals, ans_vocab_size):
+    y_pred_list = tf.split(0, batch_size, y_pred)
+    region_score_list = tf.split(1, batch_size, region_score)
+    y_avg_list = []
+    for i in xrange(batch_size):
+        y_avg_list.append(tf.matmul(region_score_list[i],y_pred_list[i]))
+    y_avg = tf.concat(0, y_avg_list)
+    return y_avg
+
 def evaluation(y, y_pred):
     correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_pred, 1), name='correct_prediction')
     accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')
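aggregate_y_pred splits y_pred into batch_size blocks of num_proposals rows (tf.split over dim 0), slices region_score into matching [1, num_proposals] pieces (tf.split over dim 1), multiplies each pair, and concatenates the [1, ans_vocab_size] results along dim 0. A hypothetical NumPy mirror of the same computation, handy as a sanity check for the graph op (toy shapes, assumes the row count divides evenly as it does here):

import numpy as np

def aggregate_y_pred_np(y_pred, region_score, batch_size):
    # Hypothetical reference, not part of this patch.
    y_pred_list = np.split(y_pred, batch_size, axis=0)       # like tf.split(0, ...)
    score_list = np.split(region_score, batch_size, axis=1)  # like tf.split(1, ...)
    y_avg_list = [s.dot(p) for s, p in zip(score_list, y_pred_list)]
    return np.concatenate(y_avg_list, axis=0)                # like tf.concat(0, ...)

batch_size, num_proposals, ans_vocab = 2, 22, 5
y_pred = np.random.rand(batch_size * num_proposals, ans_vocab)
region_score = np.random.rand(1, batch_size * num_proposals)
print(aggregate_y_pred_np(y_pred, region_score, batch_size).shape)  # (2, 5)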
diff --git a/classifiers/tf_graph_creation_helper.pyc b/classifiers/tf_graph_creation_helper.pyc
index 0a2ca99d6b8999cdfb57a84d3560b6204947ad8c..cb966f7932fe3d93f724fcdb8c59eb25570ee614 100644
Binary files a/classifiers/tf_graph_creation_helper.pyc and b/classifiers/tf_graph_creation_helper.pyc differ