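"""Preprocessing utilities for the Visual Genome dataset.

Restructures the raw annotation JSON files (objects, attributes,
question-answer pairs) into per-image and per-region lookup tables,
normalizes object and attribute labels, crops region images to a fixed
size, and builds question/answer vocabularies. Input and output paths
are set by the module-level constants below; run the desired stages
from the __main__ block.
"""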
import json
import os
import operator
import nltk
import numpy as np
from multiprocessing import Pool
import image_io


# Filenames
_datadir = '/home/ssd/VisualGenome/'
_outdir = os.path.join(_datadir, 'restructured')
_cropped_regions_dir = os.path.join(_datadir, 'cropped_regions')
_objects = 'objects.json'
_attributes = 'attributes.json'
_objects_in_image = 'objects_in_image.json'
_regions_in_image = 'regions_in_image.json'
_regions_with_attributes = 'regions_with_attributes.json'
_region_descriptions = 'region_descriptions.json'
_question_answers = 'question_answers.json'
_regions = 'regions.json'
_raw_object_labels = 'raw_object_labels.json'
_raw_attribute_labels = 'raw_attribute_labels.json'
_object_labels = 'object_labels.json'
_attribute_labels = 'attribute_labels.json'
_regions_with_labels = 'regions_with_labels.json'
_unknown_token = 'UNK'
_unopenable_images = 'unopenable_images.json'
_vocab = 'vocab.json'
_answer_vocab = 'answer_vocab.json'
_vocab_subset = 'vocab_subset.json'
_answer_vocab_subset = 'answer_vocab_subset.json'
_im_w = 80
_im_h = 80
_pool_size = 10

if not os.path.exists(_outdir):
    os.mkdir(_outdir)

def parse_objects():
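    """Restructure objects.json into per-image lookup tables.

    Writes objects_in_image.json (image id -> full object records) and
    regions_in_image.json (image id -> list of object region ids).
    """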
    filename = os.path.join(_datadir, _objects)
    with open(filename,'r') as file:
        data = json.load(file)

    objects_in_image = dict()
    for item in data:
        objects_in_image[item['id']] = item['objects']

    regions_in_image = dict()
    for item in data:
        region_ids = [obj['id'] for obj in item['objects']]
        regions_in_image[item['id']] = region_ids

    objects_in_image_out_filename = os.path.join(_outdir, 
                                                 _objects_in_image)
    with open(objects_in_image_out_filename, 'w') as outfile:
        json.dump(objects_in_image, outfile, sort_keys=True, indent=4)
        
    regions_in_image_out_filename = os.path.join(_outdir, 
                                                 _regions_in_image)
    with open(regions_in_image_out_filename, 'w') as outfile:
        json.dump(regions_in_image, outfile, sort_keys=True, indent=4)

    
def parse_attributes():
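    """Flatten attributes.json into a region id -> region record map.

    Each record keeps the region's fields (minus 'id') and gains an
    'image_id' field; the result is written to
    regions_with_attributes.json.
    """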
    filename = os.path.join(_datadir, _attributes)
    with open(filename,'r') as file:
        data = json.load(file)

    regions = dict()
    for image_data in data:
        for region_data in image_data['attributes']:
            region_data_without_id = dict()
            region_data_without_id['image_id'] = image_data['id']
            for key, value in region_data.items():
                if key != 'id':
                    region_data_without_id[key] = value
            regions[region_data['id']] = region_data_without_id
            
    regions_out_filename = os.path.join(_outdir, 
                                        _regions_with_attributes)
    with open(regions_out_filename, 'w') as outfile:
        json.dump(regions, outfile, sort_keys=True, indent=4)


def add_regions_without_attributes():
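    """Merge in object regions that have no attribute annotations.

    Object regions absent from regions_with_attributes.json are added
    with an empty attribute list, and the union is written to
    regions.json.
    """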
    regions_with_attributes_filename = os.path.join(_outdir,
                                                    _regions_with_attributes)
    with open(regions_with_attributes_filename) as file:
        regions_with_attributes_data = json.load(file)
    
    objects_in_image_filename = os.path.join(_outdir,
                                             _objects_in_image)
    with open(objects_in_image_filename) as file:
        objects_in_image_data = json.load(file)

    regions = regions_with_attributes_data
    for image_id, object_regions in objects_in_image_data.items():
        for object_region in object_regions:
            if str(object_region['id']) not in regions_with_attributes_data:
                region_data_without_id = dict()
                region_data_without_id['image_id'] = int(image_id)
                region_data_without_id['attributes'] = []
                for key, value in object_region.items():
                    if key != 'id':
                        region_data_without_id[key] = value
                # Use a string key to match the JSON-loaded keys above.
                regions[str(object_region['id'])] = region_data_without_id

    regions_out_filename = os.path.join(_outdir, 
                                        _regions)
    with open(regions_out_filename, 'w') as outfile:
        json.dump(regions, outfile, sort_keys=True, indent=4)


def stats():
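    """Print the total region count and how many regions have attributes."""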
    regions_filename = os.path.join(_outdir, _regions)
    with open(regions_filename) as file:
        regions = json.load(file)
    
    num_regions = len(regions)
    num_regions_with_attributes = 0
    for region in regions.values():
        if region['attributes']:
            num_regions_with_attributes += 1

    print 'Number of regions: {}'.format(num_regions)
    print 'Number of regions with attributes: {}'.format(
        num_regions_with_attributes)


def normalize_object_label(label, lemmatizer):
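    """Keep only the nouns (NN/NNS) in label, lemmatized and lowercased."""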
    words = nltk.tokenize.word_tokenize(label)
    nouns = []
    for word, pos_tag in nltk.pos_tag(words):
        if pos_tag=='NN' or pos_tag=='NNS':
            nouns.append(lemmatizer.lemmatize(word))
    return " ".join(nouns).lower()

            
def normalize_attribute_label(label, tokenizer):
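    """Tokenize an attribute label and return it lowercased and space-joined."""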
    words = tokenizer.tokenize(label)
    attributes = []
    for word in words:
        # The grouped regex makes tokenize() return one tuple per match
        # (one entry per group, unmatched groups empty); join it back
        # into a single token string.
        attributes.append("".join(word))
    return " ".join(attributes).lower()


def normalized_labels():
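    """Count normalized object and attribute label frequencies.

    Writes the frequency tables to raw_object_labels.json and
    raw_attribute_labels.json.
    """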
    regions_with_attributes_filename = os.path.join(_outdir,
                                                    _regions_with_attributes)
    with open(regions_with_attributes_filename) as file:
        regions = json.load(file)

    object_labels = dict()
    attribute_labels = dict()
    object_count = 0
    attribute_count = 0
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokenizer = nltk.tokenize.RegexpTokenizer(r"([^\W\d]+'[^\W\d]+)|([^\W\d]+)")
    for region_id, region_data in regions.items():
        for object_name in region_data['object_names']:
            object_name = normalize_object_label(object_name, lemmatizer)
            if object_name not in object_labels:
                object_labels[object_name] = 1
                object_count += 1
            else:
                object_labels[object_name] += 1
        
        for attribute in region_data['attributes']:
            attribute = normalize_attribute_label(attribute, tokenizer)
            if attribute not in attribute_labels:
                attribute_labels[attribute] = 1
                attribute_count += 1
            else:
                attribute_labels[attribute] += 1
        
        print 'Objects: {}    Attributes: {}'.format(object_count, 
                                                     attribute_count)
    object_labels_out_filename = os.path.join(_outdir, 
                                              _raw_object_labels)
    with open(object_labels_out_filename, 'w') as outfile:
        json.dump(object_labels, outfile, sort_keys=True, indent=4)

    attribute_labels_out_filename = os.path.join(_outdir, 
                                                 _raw_attribute_labels)
    with open(attribute_labels_out_filename, 'w') as outfile:
        json.dump(attribute_labels, outfile, sort_keys=True, indent=4)

    print 'Number of object labels: {}'.format(object_count)
    print 'Number of attribute labels: {}'.format(attribute_count)



def normalize_region_object_attribute_labels():
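    """Normalize the object and attribute labels of every region.

    Writes the updated region records to regions_with_labels.json.
    """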
    regions_with_attributes_filename = os.path.join(_outdir,
                                                    _regions_with_attributes)
    with open(regions_with_attributes_filename) as file:
        regions = json.load(file)

    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokenizer = nltk.tokenize.RegexpTokenizer(r"([^\W\d]+'[^\W\d]+)|([^\W\d]+)")
    count = 0
    for region_id, region_data in regions.items():
        object_names = []
        for object_name in region_data['object_names']:
            # Skip empty labels instead of normalizing them.
            if object_name == "":
                continue
            object_names.append(normalize_object_label(object_name, lemmatizer))
        region_data['object_names'] = object_names

        attributes = []
        for attribute in region_data['attributes']:
            if attribute == "":
                continue
            attributes.append(normalize_attribute_label(attribute, tokenizer))
        region_data['attributes'] = attributes
        count += 1
        print '{}/{}'.format(count, len(regions))
    regions_with_labels_out_filename = os.path.join(_outdir,
                                                    _regions_with_labels)
    with open(regions_with_labels_out_filename, 'w') as outfile:
        json.dump(regions, outfile, sort_keys=True, indent=4)
    

def top_k_object_labels(k):
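    """Map the k most frequent object labels to ids 0..k-1.

    If the empty label made the cut it is renamed to the UNK token.
    Writes object_labels.json.
    """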
    raw_object_labels_filename = os.path.join(_outdir,
                                              _raw_object_labels)
    with open(raw_object_labels_filename, 'r') as file:
        raw_object_labels = json.load(file)
        
    sorted_raw_object_labels = \
        [key for key, value in sorted(raw_object_labels.items(), 
                              key = operator.itemgetter(1),
                              reverse = True)]

    object_labels = dict()
    for i in xrange(min(k,len(sorted_raw_object_labels))):
        object_labels[sorted_raw_object_labels[i]] = i

    if "" in object_labels:
        object_labels[_unknown_token] = object_labels[""]
        del object_labels[""]

    object_labels_filename = os.path.join(_outdir,
                                          _object_labels)
    with open(object_labels_filename, 'w') as outfile:
        json.dump(object_labels, outfile, sort_keys=True, indent=4)


def top_k_attribute_labels(k):
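    """Map the k most frequent attribute labels to ids 0..k-1.

    If the empty label made the cut it is renamed to the UNK token.
    Writes attribute_labels.json.
    """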
    raw_attribute_labels_filename = os.path.join(_outdir,
                                                 _raw_attribute_labels)
    with open(raw_attribute_labels_filename, 'r') as file:
        raw_attribute_labels = json.load(file)
        
    sorted_raw_attribute_labels = \
        [key for key, value in sorted(raw_attribute_labels.items(), 
                              key = operator.itemgetter(1),
                              reverse = True)]

    attribute_labels = dict()
    for i in xrange(min(k,len(sorted_raw_attribute_labels))):
        attribute_labels[sorted_raw_attribute_labels[i]] = i

    if "" in attribute_labels:
        attribute_labels[_unknown_token] = attribute_labels[""]
        del attribute_labels[""]

    attribute_labels_filename = os.path.join(_outdir,
                                             _attribute_labels)
    with open(attribute_labels_filename, 'w') as outfile:
        json.dump(attribute_labels, outfile, sort_keys=True, indent=4)
 

def crop_region(region_info):
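    """Crop one region from its image and save it resized to _im_h x _im_w.

    region_info is a (region id, region record) pair; the crop goes to
    cropped_regions/<image id>/<region id>.jpg and is skipped if that
    file already exists. Unreadable images are reported and skipped.
    """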
    region_id, region_data = region_info
    image_filename = os.path.join(_datadir, 
                                  'images/' + 
                                  str(region_data['image_id']) + '.jpg')
    image_subdir = os.path.join(_cropped_regions_dir, 
                                str(region_data['image_id']))
    image_out_filename = os.path.join(image_subdir, 
                                      str(region_id) + '.jpg')
    if os.path.exists(image_out_filename):
        return

    if not os.path.exists(image_subdir):
        try:
            os.mkdir(image_subdir)
        except OSError:
            # Another worker may have created it first; ignore the race.
            pass

    try:
        image = image_io.imread(image_filename)
    except Exception:
        print 'Could not read image: {}'.format(image_filename)
        return
    
    if len(image.shape) == 3:
        im_h, im_w, im_c = image.shape
    elif len(image.shape) == 2:
        # Replicate a grayscale image across three channels.
        im_h, im_w = image.shape
        image_tmp = np.zeros([im_h, im_w, 3], dtype=image.dtype)
        for c in xrange(3):
            image_tmp[:,:,c] = image
        image = image_tmp

    # Clamp the region box to the image bounds, keeping at least 1px.
    x = min(im_w-1, max(0, region_data["x"]))
    y = min(im_h-1, max(0, region_data["y"]))
    h = min(im_h-y, max(region_data["h"], 1))
    w = min(im_w-x, max(region_data["w"], 1))
    
    cropped_region = image_io.imresize(image[y:y+h,x:x+w,:],
                                       output_size=(_im_h, _im_w))
    image_io.imwrite(cropped_region, image_out_filename)


def crop_regions_parallel():
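    """Crop every labeled region using a pool of _pool_size workers."""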
    regions_filename = os.path.join(_outdir,
                                    _regions_with_labels)
    with open(regions_filename) as file:
        regions = json.load(file)
        
    if not os.path.exists(_cropped_regions_dir):
        os.mkdir(_cropped_regions_dir)

    pool = Pool(_pool_size)
    try:
        pool.map(crop_region, regions.items())
    finally:
        pool.close()


def crop_regions():
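    """Serial version of crop_regions_parallel with progress output.

    On failure, prints the offending region before re-raising.
    """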
    regions_filename = os.path.join(_outdir,
                                    _regions_with_labels)
    with open(regions_filename) as file:
        regions = json.load(file)
        
    if not os.path.exists(_cropped_regions_dir):
        os.mkdir(_cropped_regions_dir)

    count = 0
    for region_id, region_data in regions.items():
        try:
            image_filename = os.path.join(_datadir, 
                                          'images/' + 
                                          str(region_data['image_id']) + '.jpg')
            image = image_io.imread(image_filename)
            
            if len(image.shape) == 3:
                im_h, im_w, im_c = image.shape
            elif len(image.shape) == 2:
                # Replicate a grayscale image across three channels.
                im_h, im_w = image.shape
                image_tmp = np.zeros([im_h, im_w, 3], dtype=image.dtype)
                for c in xrange(3):
                    image_tmp[:,:,c] = image
                image = image_tmp

            # Clamp the region box to the image bounds, keeping at least 1px.
            x = min(im_w-1, max(0, region_data["x"]))
            y = min(im_h-1, max(0, region_data["y"]))
            h = min(im_h-y, max(region_data["h"], 1))
            w = min(im_w-x, max(region_data["w"], 1))

            cropped_region = image_io.imresize(image[y:y+h,x:x+w,:],
                                               output_size=(_im_h, _im_w))
            image_subdir = os.path.join(_cropped_regions_dir, 
                                        str(region_data['image_id']))
            if not os.path.exists(image_subdir):
                os.mkdir(image_subdir)

            image_out_filename = os.path.join(image_subdir, 
                                              str(region_id) + '.jpg')
        
            image_io.imwrite(cropped_region, image_out_filename)
        
            count += 1
            
            print '{}/{}'.format(count, len(regions))
        except:
            print region_id, region_data
            raise


def construct_vocabulary():
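    """Build word and answer frequency tables from question_answers.json.

    Question and answer words share one vocabulary (vocab.json), while
    whole answer strings form a separate one (answer_vocab.json). The
    tokenizer pattern drops digits and punctuation.
    """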
    question_answers_filename = os.path.join(_datadir, _question_answers)
    with open(question_answers_filename) as file:
        question_answers = json.load(file)
    
    vocab = dict()
    answer_vocab = dict()
    tokenizer = nltk.tokenize.RegexpTokenizer(r"([^\W\d]+'[^\W\d]+)|([^\W\d]+)")
    # tokenizer = nltk.tokenize.RegexpTokenizer("[^-?.,:* \d\"]+")
    for image_qas in question_answers:
        for qa in image_qas['qas']:
            answer_words = tokenizer.tokenize(qa['answer'])
            question_words = tokenizer.tokenize(qa['question'])
            for word in question_words + answer_words:
                word_lower = "".join(word).lower()
                if word_lower in vocab:
                    vocab[word_lower] += 1
                else:
                    vocab[word_lower] = 1
    
            answer = []
            for word in answer_words:
                answer.append("".join(word).lower()) 
            answer = " ".join(answer)
            if answer in answer_vocab:
                answer_vocab[answer] += 1
            else:
                answer_vocab[answer] = 1

    vocab_filename = os.path.join(_outdir, _vocab)
    with open(vocab_filename, 'w') as outfile:
        json.dump(vocab, outfile, sort_keys=True, indent=4)

    answer_vocab_filename = os.path.join(_outdir, _answer_vocab)
    with open(answer_vocab_filename, 'w') as outfile:
        json.dump(answer_vocab, outfile, sort_keys=True, indent=4)
    
    print "Vocab Size: {}".format(len(vocab))
    print "Answer Vocab Size: {}".format(len(answer_vocab))


def select_vocab_subset(k):
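    """Keep the k most frequent words plus the UNK token and digits 0-9.

    Digits are added explicitly because the tokenizer strips them.
    Writes vocab_subset.json and prints the token recall of the subset.
    """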
    vocab_filename = os.path.join(_outdir, _vocab)
    with open(vocab_filename, 'r') as file:
        vocab = json.load(file)
        
    sorted_vocab = \
        [key for key, value in sorted(vocab.items(), 
                                      key = operator.itemgetter(1),
                                      reverse = True)]

    vocab_subset = dict()
    vocab_subset_chars_only_size = min(k, len(sorted_vocab))

    for i in xrange(vocab_subset_chars_only_size):
        vocab_subset[sorted_vocab[i]] = i
    vocab_subset[_unknown_token] = vocab_subset_chars_only_size

    # Digits are stripped by the tokenizer, so add them explicitly;
    # offset by 1 so '0' does not collide with the UNK id above.
    for i in xrange(10):
        vocab_subset[str(i)] = vocab_subset_chars_only_size + 1 + i

    vocab_subset_filename = os.path.join(_outdir, _vocab_subset)
    with open(vocab_subset_filename, 'w') as outfile:
        json.dump(vocab_subset, outfile, sort_keys=True, indent=4)

    recalled = 0.0
    not_recalled = 0.0
    for word in vocab:
        if word in vocab_subset:
            recalled += vocab[word]
        else:
            not_recalled += vocab[word]

    print 'Recall: {}'.format(recalled/(recalled + not_recalled))

def select_answer_subset(k):
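    """Keep the k most frequent answers as answer_vocab_subset.json.

    Prints each answer that falls outside the subset, then the overall
    answer recall.
    """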
    answer_vocab_filename = os.path.join(_outdir, _answer_vocab)
    with open(answer_vocab_filename, 'r') as file:
        answer_vocab = json.load(file)
        
    sorted_answer_vocab = \
        [key for key, value in sorted(answer_vocab.items(), 
                                      key = operator.itemgetter(1),
                                      reverse = True)]

    answer_vocab_subset = dict()
    for i in xrange(min(k,len(sorted_answer_vocab))):
        answer_vocab_subset[sorted_answer_vocab[i]] = i

    answer_vocab_subset_filename = os.path.join(_outdir, _answer_vocab_subset)
    with open(answer_vocab_subset_filename, 'w') as outfile:
        json.dump(answer_vocab_subset, outfile, sort_keys=True, indent=4)

    recalled = 0.0
    not_recalled = 0.0
    for answer in answer_vocab:
        if answer in answer_vocab_subset:
            recalled += answer_vocab[answer]
        else:
            print answer
            not_recalled += answer_vocab[answer]

    print 'Recall: {}'.format(recalled/(recalled + not_recalled))
    

if __name__=='__main__':
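    # Pipeline stages, roughly in the order they are meant to be run;
    # uncomment the stage(s) to execute.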
    # parse_objects()
    # parse_attributes()
    # add_regions_without_attributes()
    # stats()
    # normalized_labels()
    # normalize_region_object_attribute_labels()
    # top_k_object_labels(1000)
    # top_k_attribute_labels(1000)
    crop_regions_parallel()
    # construct_vocabulary()
    # select_vocab_subset(10000)
    # select_answer_subset(5000)