From d80c835a54f30bd1e8805217636d7f992e32b6fd Mon Sep 17 00:00:00 2001
From: tgupta6 <tgupta6@illinois.edu>
Date: Sat, 20 Aug 2016 15:01:22 -0500
Subject: [PATCH] Circular batch and train held out

---
 answer_classifier_cached_features/eval.py  | 23 ++++++++++---------
 .../select_best_model.py                   | 21 +++++++++--------
 answer_classifier_cached_features/train.py |  2 +-
 constants_crunchy.py                       |  5 ++--
 tftools/data.py                            | 10 ++++----
 5 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/answer_classifier_cached_features/eval.py b/answer_classifier_cached_features/eval.py
index 09d5e6d..5e32665 100644
--- a/answer_classifier_cached_features/eval.py
+++ b/answer_classifier_cached_features/eval.py
@@ -38,23 +38,18 @@ def create_initializer(graph, sess, model):
     return initializer()
 
 def create_batch_generator(mode):
-    if mode=='val_subset':
+    if mode=='val':
         vqa_resnet_feat_dir = constants.vqa_val_resnet_feat_dir
         vqa_anno = constants.vqa_val_anno
-        num_questions = constants.num_val_subset_questions
+        num_questions = constants.num_val_questions
         offset = 0
-    elif mode=='val_rest':
+    elif mode=='val_subset':
         vqa_resnet_feat_dir = constants.vqa_val_resnet_feat_dir
         vqa_anno = constants.vqa_val_anno
-        num_questions = constants.num_val_rest_questions
-        offset = constants.num_val_subset_questions
-    elif mode=='train':
-        vqa_resnet_feat_dir = constants.vqa_train_resnet_feat_dir
-        vqa_anno = constants.vqa_train_anno
-        num_questions = constants.num_train_questions
+        num_questions = constants.num_val_subset_questions
         offset = 0
     else:
-        print "mode needs to be one of {'train','val_subset','val_rest'}, found " + mode
+        print "mode needs to be one of {'val','val_subset'}, found " + mode
 
     data_mgr = vqa_data.data(
         vqa_resnet_feat_dir,
@@ -143,6 +138,7 @@ class eval_mgr():
         self.correct = 0
         self.total = 0
         self.results = []
+        self.seen_qids = set()
 
     def eval(self, iter, eval_vars_dict, batch):
         batch_size = len(batch['question_unencoded'])
@@ -173,7 +169,12 @@ class eval_mgr():
                 'question_id': int(question_id),
                 'answer': pred_answer
             }
-            self.results.append(result_entry)
+
+            if question_id not in self.seen_qids:
+                self.seen_qids.add(question_id)
+                self.results.append(result_entry)
+            else:
+                print 'Already evaluated on this sample'
 
             self.eval_data[str(question_id)] = dict_entry
 
diff --git a/answer_classifier_cached_features/select_best_model.py b/answer_classifier_cached_features/select_best_model.py
index ee6b270..f483337 100644
--- a/answer_classifier_cached_features/select_best_model.py
+++ b/answer_classifier_cached_features/select_best_model.py
@@ -39,16 +39,13 @@ def create_initializer(graph, sess, model):
     return initializer()
 
 def create_batch_generator(mode):
-    if mode=='val':
-        vqa_resnet_feat_dir = constants.vqa_val_resnet_feat_dir
-        vqa_anno = constants.vqa_val_anno
-        num_questions = constants.num_val_questions
-    elif mode=='train':
+    if mode=='train_subset':
         vqa_resnet_feat_dir = constants.vqa_train_resnet_feat_dir
         vqa_anno = constants.vqa_train_anno
-        num_questions = constants.num_train_questions
+        num_questions = constants.num_train_held_out_questions
+        offset = constants.num_train_subset_questions
     else:
-        print "mode needs to be one of {'train','test','val'}, found " + mode
+        print "mode needs to be one of {'train_subset'}, found " + mode
 
     data_mgr = vqa_data.data(
         vqa_resnet_feat_dir,
@@ -64,7 +61,7 @@
         constants.answer_batch_size,
         num_questions,
         1,
-        0)
+        offset)
 
     batch_generator = tftools.data.async_batch_generator(
         data_mgr,
@@ -135,6 +132,7 @@ class eval_mgr():
         self.correct = 0
         self.total = 0
         self.results = []
+        self.seen_qids = set()
 
     def eval(self, iter, eval_vars_dict, batch):
         batch_size = len(batch['question_unencoded'])
@@ -151,7 +149,12 @@ class eval_mgr():
                 'question_id': int(question_id),
                 'answer': pred_answer
             }
-            self.results.append(result_entry)
+
+            if question_id not in self.seen_qids:
+                self.seen_qids.add(question_id)
+                self.results.append(result_entry)
+            else:
+                print 'Already evaluated on this sample'
 
         self.total += batch_size
 
diff --git a/answer_classifier_cached_features/train.py b/answer_classifier_cached_features/train.py
index 608516a..63dfdf3 100644
--- a/answer_classifier_cached_features/train.py
+++ b/answer_classifier_cached_features/train.py
@@ -475,7 +475,7 @@ def create_vqa_batch_generator():
 
     index_generator = tftools.data.random(
         constants.answer_batch_size,
-        constants.num_train_questions,
+        constants.num_train_subset_questions,
         constants.answer_num_epochs,
         constants.answer_offset)
 
diff --git a/constants_crunchy.py b/constants_crunchy.py
index ac7d315..df19281 100644
--- a/constants_crunchy.py
+++ b/constants_crunchy.py
@@ -154,9 +154,10 @@ vqa_answer_vocab_json = os.path.join(
 
 # VQA dataset params
 num_train_questions = 248349
-num_val_subset_questions = 10000
+num_train_held_out_questions = 24835
+num_train_subset_questions = num_train_questions - num_train_held_out_questions
 num_val_questions = 121512
-num_val_rest_questions = num_val_questions - num_val_subset_questions
+num_val_subset_questions = 10000
 num_test_questions = 0
 
 
diff --git a/tftools/data.py b/tftools/data.py
index 1a60f44..a872a06 100644
--- a/tftools/data.py
+++ b/tftools/data.py
@@ -6,21 +6,23 @@ import time
 
 def sequential(batch_size, num_samples, num_epochs=1, offset=0):
     """Generate sequence indices.
     """
+    num_samples_ = int(batch_size*np.ceil(num_samples/float(batch_size)))
     for epoch in range(num_epochs):
-        indices = np.arange(num_samples) + offset
+        indices = np.arange(num_samples_)%num_samples + offset
         indices = indices.tolist()
-        for i in range(0, num_samples - batch_size + 1, batch_size):
+        for i in range(0, num_samples_ - batch_size + 1, batch_size):
             yield indices[i:i+batch_size]
 
 
 def random(batch_size, num_samples, num_epochs, offset=0):
     """Generate random indices.
     """
+    num_samples_ = int(batch_size*np.ceil(num_samples/float(batch_size)))
     for epoch in range(num_epochs):
         # np.random.seed(epoch)
-        indices = np.random.permutation(num_samples) + offset
+        indices = np.random.permutation(num_samples_)%num_samples + offset
         indices = indices.tolist()
-        for i in range(0, num_samples - batch_size + 1, batch_size):
+        for i in range(0, num_samples_ - batch_size + 1, batch_size):
             yield indices[i:i+batch_size]
--
GitLab
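
Note on the change: both generators in tftools/data.py now pad the sample
count up to the next multiple of batch_size and fold the padded index range
back onto [0, num_samples) with a modulo, so the trailing partial batch is
filled by wrapping around to the start of the range instead of being dropped
(the "circular batch" of the commit title). A minimal standalone sketch of
that behavior, using toy numbers in place of the VQA constants:

    import numpy as np

    def sequential(batch_size, num_samples, num_epochs=1, offset=0):
        # Pad num_samples up to a multiple of batch_size, then wrap the
        # padded index range back onto [0, num_samples) with a modulo.
        num_samples_ = int(batch_size*np.ceil(num_samples/float(batch_size)))
        for epoch in range(num_epochs):
            indices = (np.arange(num_samples_) % num_samples + offset).tolist()
            for i in range(0, num_samples_ - batch_size + 1, batch_size):
                yield indices[i:i+batch_size]

    # 7 samples, batch size 3: the pre-patch generator yielded [0, 1, 2] and
    # [3, 4, 5] and silently dropped sample 6; the circular version wraps.
    for batch in sequential(3, 7):
        print(batch)
    # [0, 1, 2]
    # [3, 4, 5]
    # [6, 0, 1]

Because a padded epoch re-visits indices from the start of the range, any
consumer that aggregates per-sample results must de-duplicate; that is what
the seen_qids set added to both eval_mgr classes does, scoring each
question_id once and printing 'Already evaluated on this sample' on a
wrapped repeat. The "train held out" half of the commit reserves the last
24835 of the 248349 training questions (about 10%) as a held-out set:
select_best_model.py reads them via offset = num_train_subset_questions,
while train.py now draws its random batches over the remaining
num_train_subset_questions questions only.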