# visual_genome_parser.py
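"""Preprocessing utilities for the Visual Genome dataset (Python 2).

Restructures the raw Visual Genome JSON dumps (objects.json, attributes.json,
question_answers.json) into per-region annotation files, builds normalized
object/attribute label sets and question/answer vocabularies, and crops and
resizes region images for downstream use.
"""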
import json
import os
import operator
import nltk
import numpy as np
import pdb
from multiprocessing import Pool
import image_io
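# image_io is assumed here to be a project-local helper module providing the
# imread, imresize(output_size=...), and imwrite wrappers used below.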
# Filenames
_datadir = '/home/ssd/VisualGenome/'
_outdir = os.path.join(_datadir,'restructured')
_cropped_regions_dir = os.path.join(_datadir, 'cropped_regions')
_objects = 'objects.json'
_attributes = 'attributes.json'
_objects_in_image = 'objects_in_image.json'
_regions_in_image = 'regions_in_image.json'
_regions_with_attributes = 'regions_with_attributes.json'
_region_descriptions = 'region_descriptions.json'
_question_answers = 'question_answers.json'
_regions = 'regions.json'
_raw_object_labels = 'raw_object_labels.json'
_raw_attribute_labels = 'raw_attribute_labels.json'
_object_labels = 'object_labels.json'
_attribute_labels = 'attribute_labels.json'
_regions_with_labels = 'region_with_labels.json'
_unknown_token = 'UNK'
_unopenable_images = 'unopenable_images.json'
_vocab = 'vocab.json'
_answer_vocab = 'answer_vocab.json'
_vocab_subset = 'vocab_subset.json'
_answer_vocab_subset = 'answer_vocab_subset.json'
_im_w = 80
_im_h = 80
_pool_size = 10
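# The raw Visual Genome release is assumed to live under _datadir as
# objects.json, attributes.json, question_answers.json and an images/
# directory containing <image_id>.jpg files; all derived files go to _outdir.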
if not os.path.exists(_outdir):
    os.mkdir(_outdir)

def parse_objects():
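    """Write per-image object lists and per-image object-id lists from objects.json."""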
    filename = os.path.join(_datadir, _objects)
    with open(filename, 'r') as file:
        data = json.load(file)

    objects_in_image = dict()
    for item in data:
        objects_in_image[item['id']] = item['objects']

    regions_in_image = dict()
    for item in data:
        region_ids = [object['id'] for object in item['objects']]
        regions_in_image[item['id']] = region_ids

    objects_in_image_out_filename = os.path.join(_outdir,
                                                 _objects_in_image)
    with open(objects_in_image_out_filename, 'w') as outfile:
        json.dump(objects_in_image, outfile, sort_keys=True, indent=4)

    regions_in_image_out_filename = os.path.join(_outdir,
                                                 _regions_in_image)
    with open(regions_in_image_out_filename, 'w') as outfile:
        json.dump(regions_in_image, outfile, sort_keys=True, indent=4)

def parse_attributes():
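    """Index the attribute annotations by region id and write regions_with_attributes.json."""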
    filename = os.path.join(_datadir, _attributes)
    with open(filename, 'r') as file:
        data = json.load(file)

    regions = dict()
    for image_data in data:
        for region_data in image_data['attributes']:
            region_data_without_id = dict()
            region_data_without_id['image_id'] = image_data['id']
            for key, value in region_data.items():
                if key != 'id':
                    region_data_without_id[key] = value
            regions[region_data['id']] = region_data_without_id

    regions_out_filename = os.path.join(_outdir,
                                        _regions_with_attributes)
    with open(regions_out_filename, 'w') as outfile:
        json.dump(regions, outfile, sort_keys=True, indent=4)

def add_regions_without_attributes():
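    """Merge object regions that have no attribute annotations into the region index."""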
    regions_with_attributes_filename = os.path.join(_outdir,
                                                    _regions_with_attributes)
    with open(regions_with_attributes_filename) as file:
        regions_with_attributes_data = json.load(file)

    objects_in_image_filename = os.path.join(_outdir,
                                             _objects_in_image)
    with open(objects_in_image_filename) as file:
        objects_in_image_data = json.load(file)

    regions = regions_with_attributes_data
    for image_id, object_regions in objects_in_image_data.items():
        for object_region in object_regions:
            if str(object_region['id']) not in regions_with_attributes_data:
                region_data_without_id = dict()
                region_data_without_id['image_id'] = int(image_id)
                region_data_without_id['attributes'] = []
                for key, value in object_region.items():
                    if key != 'id':
                        region_data_without_id[key] = value
                # Use a string key so new entries match the keys loaded from JSON.
                regions[str(object_region['id'])] = region_data_without_id

    regions_out_filename = os.path.join(_outdir,
                                        _regions)
    with open(regions_out_filename, 'w') as outfile:
        json.dump(regions, outfile, sort_keys=True, indent=4)

def stats():
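    """Print how many regions exist and how many of them carry attributes."""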
    regions_filename = os.path.join(_outdir, _regions)
    with open(regions_filename) as file:
        regions = json.load(file)

    num_regions = len(regions)
    num_regions_with_attributes = 0
    for region in regions.values():
        if region['attributes']:
            num_regions_with_attributes += 1

    print 'Number of regions: {}'.format(num_regions)
    print 'Number of regions with attributes: {}'.format(
        num_regions_with_attributes)

def normalize_object_label(label, lemmatizer):
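    """Keep only nouns (NN/NNS), lemmatize them, and return a lowercased string."""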
    words = nltk.tokenize.word_tokenize(label)
    nouns = []
    for word, pos_tag in nltk.pos_tag(words):
        if pos_tag == 'NN' or pos_tag == 'NNS':
            nouns.append(lemmatizer.lemmatize(word))
    return " ".join(nouns).lower()

def normalize_attribute_label(label, tokenizer):
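    """Tokenize an attribute string and return the lowercased, space-joined tokens."""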
    words = tokenizer.tokenize(label)
    attributes = []
    for word in words:
        # The regexp tokenizer returns a tuple per match (one entry per capture
        # group); joining collapses it back into a single token string.
        attributes.append("".join(word))
    return " ".join(attributes).lower()

def normalized_labels():
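    """Count normalized object and attribute labels and write the raw label histograms."""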
    regions_with_attributes_filename = os.path.join(_outdir,
                                                    _regions_with_attributes)
    with open(regions_with_attributes_filename) as file:
        regions = json.load(file)

    object_labels = dict()
    attribute_labels = dict()
    object_count = 0
    attribute_count = 0
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokenizer = nltk.tokenize.RegexpTokenizer(r"([^\W\d]+'[^\W\d]+)|([^\W\d]+)")
    for region_id, region_data in regions.items():
        for object in region_data['object_names']:
            object = normalize_object_label(object, lemmatizer)
            if object not in object_labels:
                object_labels[object] = 1
                object_count += 1
            else:
                object_labels[object] += 1
        for attribute in region_data['attributes']:
            attribute = normalize_attribute_label(attribute, tokenizer)
            if attribute not in attribute_labels:
                attribute_labels[attribute] = 1
                attribute_count += 1
            else:
                attribute_labels[attribute] += 1

    print 'Objects: {} Attributes: {}'.format(object_count,
                                              attribute_count)

    object_labels_out_filename = os.path.join(_outdir,
                                              _raw_object_labels)
    with open(object_labels_out_filename, 'w') as outfile:
        json.dump(object_labels, outfile, sort_keys=True, indent=4)

    attribute_labels_out_filename = os.path.join(_outdir,
                                                 _raw_attribute_labels)
    with open(attribute_labels_out_filename, 'w') as outfile:
        json.dump(attribute_labels, outfile, sort_keys=True, indent=4)

    print 'Number of object labels: {}'.format(object_count)
    print 'Number of attribute labels: {}'.format(attribute_count)

def normalize_region_object_attribute_labels():
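    """Normalize every region's object names and attributes and write the regions-with-labels file."""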
    regions_with_attributes_filename = os.path.join(_outdir,
                                                    _regions_with_attributes)
    with open(regions_with_attributes_filename) as file:
        regions = json.load(file)

    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokenizer = nltk.tokenize.RegexpTokenizer(r"([^\W\d]+'[^\W\d]+)|([^\W\d]+)")
    count = 0
    for region_id, region_data in regions.items():
        object_names = []
        for object in region_data['object_names']:
            # Skip empty object names.
            if object == "":
                continue
            object_names.append(normalize_object_label(object, lemmatizer))
        region_data['object_names'] = object_names

        attributes = []
        for attribute in region_data['attributes']:
            # Skip empty attributes.
            if attribute == "":
                continue
            attributes.append(normalize_attribute_label(attribute, tokenizer))
        region_data['attributes'] = attributes

        count += 1
        print '{}/{}'.format(count, len(regions))

    regions_with_labels_out_filename = os.path.join(_outdir,
                                                    _regions_with_labels)
    with open(regions_with_labels_out_filename, 'w') as outfile:
        json.dump(regions, outfile, sort_keys=True, indent=4)

def top_k_object_labels(k):
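    """Map the k most frequent object labels to integer ids, replacing "" with UNK."""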
    raw_object_labels_filename = os.path.join(_outdir,
                                              _raw_object_labels)
    with open(raw_object_labels_filename, 'r') as file:
        raw_object_labels = json.load(file)

    sorted_raw_object_labels = \
        [key for key, value in sorted(raw_object_labels.items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)]
    object_labels = dict()
    for i in xrange(min(k, len(sorted_raw_object_labels))):
        object_labels[sorted_raw_object_labels[i]] = i

    if "" in object_labels:
        object_labels[_unknown_token] = object_labels[""]
        del object_labels[""]

    object_labels_filename = os.path.join(_outdir,
                                          _object_labels)
    with open(object_labels_filename, 'w') as outfile:
        json.dump(object_labels, outfile, sort_keys=True, indent=4)

def top_k_attribute_labels(k):
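    """Map the k most frequent attribute labels to integer ids, replacing "" with UNK."""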
    raw_attribute_labels_filename = os.path.join(_outdir,
                                                 _raw_attribute_labels)
    with open(raw_attribute_labels_filename, 'r') as file:
        raw_attribute_labels = json.load(file)

    sorted_raw_attribute_labels = \
        [key for key, value in sorted(raw_attribute_labels.items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)]
    attribute_labels = dict()
    for i in xrange(min(k, len(sorted_raw_attribute_labels))):
        attribute_labels[sorted_raw_attribute_labels[i]] = i

    if "" in attribute_labels:
        attribute_labels[_unknown_token] = attribute_labels[""]
        del attribute_labels[""]

    attribute_labels_filename = os.path.join(_outdir,
                                             _attribute_labels)
    with open(attribute_labels_filename, 'w') as outfile:
        json.dump(attribute_labels, outfile, sort_keys=True, indent=4)

def crop_region(region_info):
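    """Crop a single region from its source image, resize it, and write it to disk.

    region_info is a (region_id, region_data) pair, so this function can be
    mapped directly over regions.items() by a multiprocessing pool.
    """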
    region_id, region_data = region_info
    image_filename = os.path.join(_datadir,
                                  'images/' +
                                  str(region_data['image_id']) + '.jpg')
    image_subdir = os.path.join(_cropped_regions_dir,
                                str(region_data['image_id']))
    image_out_filename = os.path.join(image_subdir,
                                      str(region_id) + '.jpg')
    if os.path.exists(image_out_filename):
        return
    if not os.path.exists(image_subdir):
        try:
            os.mkdir(image_subdir)
        except OSError:
            # Another worker may have created the directory in the meantime.
            pass
    try:
        image = image_io.imread(image_filename)
    except:
        print 'Could not read image: {}'.format(image_filename)
        return

    if len(image.shape) == 3:
        im_h, im_w, im_c = image.shape
    elif len(image.shape) == 2:
        # Grayscale image: replicate the single channel to get 3 channels.
        im_h, im_w = image.shape
        image_tmp = np.zeros([im_h, im_w, 3], dtype=image.dtype)
        for c in xrange(3):
            image_tmp[:, :, c] = image
        image = image_tmp

    # Clamp the region box to the image bounds.
    x = min(im_w - 1, max(0, region_data["x"]))
    y = min(im_h - 1, max(0, region_data["y"]))
    h = min(im_h - y, max(region_data["h"], 1))
    w = min(im_w - x, max(region_data["w"], 1))
    cropped_region = image_io.imresize(image[y:y + h, x:x + w, :],
                                       output_size=(_im_h, _im_w))
    image_io.imwrite(cropped_region, image_out_filename)

def crop_regions_parallel():
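    """Crop all labeled regions using a multiprocessing pool of _pool_size workers."""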
    regions_filename = os.path.join(_outdir,
                                    _regions_with_labels)
    with open(regions_filename) as file:
        regions = json.load(file)

    if not os.path.exists(_cropped_regions_dir):
        os.mkdir(_cropped_regions_dir)

    pool = Pool(_pool_size)
    try:
        pool.map(crop_region, regions.items())
    except:
        pool.close()
        raise
    pool.close()

def crop_regions():
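    """Serial variant of crop_regions_parallel: crop and resize every labeled region."""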
    regions_filename = os.path.join(_outdir,
                                    _regions_with_labels)
    with open(regions_filename) as file:
        regions = json.load(file)

    if not os.path.exists(_cropped_regions_dir):
        os.mkdir(_cropped_regions_dir)

    count = 0
    for region_id, region_data in regions.items():
        try:
            image_filename = os.path.join(_datadir,
                                          'images/' +
                                          str(region_data['image_id']) + '.jpg')
            image = image_io.imread(image_filename)
            if len(image.shape) == 3:
                im_h, im_w, im_c = image.shape
            elif len(image.shape) == 2:
                im_h, im_w = image.shape
                image_tmp = np.zeros([im_h, im_w, 3], dtype=image.dtype)
                for c in xrange(3):
                    image_tmp[:, :, c] = image
                image = image_tmp
            x = min(im_w - 1, max(0, region_data["x"]))
            y = min(im_h - 1, max(0, region_data["y"]))
            h = min(im_h - y, max(region_data["h"], 1))
            w = min(im_w - x, max(region_data["w"], 1))
            cropped_region = image_io.imresize(image[y:y + h, x:x + w, :],
                                               output_size=(_im_h, _im_w))
            image_subdir = os.path.join(_cropped_regions_dir,
                                        str(region_data['image_id']))
            if not os.path.exists(image_subdir):
                os.mkdir(image_subdir)
            image_out_filename = os.path.join(image_subdir,
                                              str(region_id) + '.jpg')
            image_io.imwrite(cropped_region, image_out_filename)
            count += 1
            print '{}/{}'.format(count, len(regions))
        except:
            print region_id, region_data
            raise

def construct_vocabulary():
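    """Build word and answer frequency vocabularies from question_answers.json."""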
    question_answers_filename = os.path.join(_datadir, _question_answers)
    with open(question_answers_filename) as file:
        question_answers = json.load(file)

    vocab = dict()
    answer_vocab = dict()
    tokenizer = nltk.tokenize.RegexpTokenizer(r"([^\W\d]+'[^\W\d]+)|([^\W\d]+)")
    # tokenizer = nltk.tokenize.RegexpTokenizer("[^-?.,:* \d\"]+")
    for image_qas in question_answers:
        for qa in image_qas['qas']:
            answer_words = tokenizer.tokenize(qa['answer'])
            question_words = tokenizer.tokenize(qa['question'])
            for word in question_words + answer_words:
                word_lower = "".join(word).lower()
                if word_lower in vocab:
                    vocab[word_lower] += 1
                else:
                    vocab[word_lower] = 1
            answer = []
            for word in answer_words:
                answer.append("".join(word).lower())
            answer = " ".join(answer)
            if answer in answer_vocab:
                answer_vocab[answer] += 1
            else:
                answer_vocab[answer] = 1

    vocab_filename = os.path.join(_outdir, _vocab)
    with open(vocab_filename, 'w') as outfile:
        json.dump(vocab, outfile, sort_keys=True, indent=4)

    answer_vocab_filename = os.path.join(_outdir, _answer_vocab)
    with open(answer_vocab_filename, 'w') as outfile:
        json.dump(answer_vocab, outfile, sort_keys=True, indent=4)

    print "Vocab Size: {}".format(len(vocab))
    print "Answer Vocab Size: {}".format(len(answer_vocab))

def select_vocab_subset(k):
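    """Keep the k most frequent words, add UNK and digit tokens, and report recall."""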
    vocab_filename = os.path.join(_outdir, _vocab)
    with open(vocab_filename, 'r') as file:
        vocab = json.load(file)

    sorted_vocab = \
        [key for key, value in sorted(vocab.items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)]
    vocab_subset = dict()
    vocab_subset_chars_only_size = min(k, len(sorted_vocab))
    for i in xrange(vocab_subset_chars_only_size):
        vocab_subset[sorted_vocab[i]] = i
    vocab_subset[_unknown_token] = vocab_subset_chars_only_size
    # Give the digit tokens their own ids after UNK; the tokenizer drops
    # digits, so they never appear in the raw vocabulary counts.
    for i in xrange(10):
        str_i = str(i)
        vocab_subset[str_i] = vocab_subset_chars_only_size + 1 + i

    vocab_subset_filename = os.path.join(_outdir, _vocab_subset)
    with open(vocab_subset_filename, 'w') as outfile:
        json.dump(vocab_subset, outfile, sort_keys=True, indent=4)

    recalled = 0.0
    not_recalled = 0.0
    for word in vocab:
        if word in vocab_subset:
            recalled += vocab[word]
        else:
            not_recalled += vocab[word]
    print 'Recall: {}'.format(recalled / (recalled + not_recalled))

def select_answer_subset(k):
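    """Keep the k most frequent answers and report answer recall over the full answer vocabulary."""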
    answer_vocab_filename = os.path.join(_outdir, _answer_vocab)
    with open(answer_vocab_filename, 'r') as file:
        answer_vocab = json.load(file)

    sorted_answer_vocab = \
        [key for key, value in sorted(answer_vocab.items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)]
    answer_vocab_subset = dict()
    for i in xrange(min(k, len(sorted_answer_vocab))):
        answer_vocab_subset[sorted_answer_vocab[i]] = i

    answer_vocab_subset_filename = os.path.join(_outdir, _answer_vocab_subset)
    with open(answer_vocab_subset_filename, 'w') as outfile:
        json.dump(answer_vocab_subset, outfile, sort_keys=True, indent=4)

    recalled = 0.0
    not_recalled = 0.0
    for answer in answer_vocab:
        if answer in answer_vocab_subset:
            recalled += answer_vocab[answer]
        else:
            print answer
            not_recalled += answer_vocab[answer]
    print 'Recall: {}'.format(recalled / (recalled + not_recalled))

if __name__=='__main__':
    # The stages below form a pipeline; earlier stages write the files that
    # later stages read. Uncomment the ones you want to run.
    # parse_objects()
    # parse_attributes()
    # add_regions_without_attributes()
    # stats()
    # normalized_labels()
    # normalize_region_object_attribute_labels()
    # top_k_object_labels(1000)
    # top_k_attribute_labels(1000)
    crop_regions_parallel()
    # construct_vocabulary()
    # select_vocab_subset(10000)
    # select_answer_subset(5000)