diff --git a/CommonUtilities.py b/CommonUtilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..400c41e184beb27a4949e89e01ec5034b6e3187f
--- /dev/null
+++ b/CommonUtilities.py
@@ -0,0 +1,4 @@
+class CommonUtilities:
+    @staticmethod
+    def obj_dict(obj):
+        return obj.__dict__
\ No newline at end of file
diff --git a/IndexBuilder.py b/IndexBuilder.py
new file mode 100644
index 0000000000000000000000000000000000000000..5536a29e1e7ec3c97e17fb9dd84666d625d3d93d
--- /dev/null
+++ b/IndexBuilder.py
@@ -0,0 +1,55 @@
+import os, os.path
+from whoosh.index import create_in
+from whoosh.fields import *
+import xml.dom.minidom as dom
+
+class IndexBuilder:
+    def __init__(self):
+        self.schema = Schema(paper=ID(stored=True), abstract=TEXT(stored=True), title=TEXT(stored=True), introduction=TEXT(stored=True))
+        self.ix = create_in("index", self.schema)
+        self.path = "papers_to_index/"
+    
+    def build(self):
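+        # Each file is read line by line; every line is expected to hold one
+        # XML element: <abstract>, <title> or <introduction>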
+        file_list = []
+        for (dirpath, dirnames, filenames) in os.walk(self.path):
+            file_list.extend(filenames)
+        
+        writer = self.ix.writer()
+        for file in file_list:
+            fileNew = os.path.join(self.path, file)
+            paperId = os.path.splitext(file)[0]
+            abstractField = titleField = introductionField = ""
+            f = open(fileNew, encoding="utf8")
+            for line in f:
+                xmltree = dom.parseString(line)
+                if (xmltree.getElementsByTagName('abstract')):
+                    if  xmltree.getElementsByTagName('abstract')[0].firstChild is None:
+                        abstractField = ""
+                    else:
+                        abstractField = xmltree.getElementsByTagName('abstract')[0].firstChild.nodeValue
+                
+                elif (xmltree.getElementsByTagName('title')):
+                    if xmltree.getElementsByTagName('title')[0].firstChild is None:
+                        titleField = ""
+                    else:
+                        titleField = xmltree.getElementsByTagName('title')[0].firstChild.nodeValue
+
+                elif (xmltree.getElementsByTagName('introduction')):
+                    if (xmltree.getElementsByTagName('introduction')[0].firstChild is None):  
+                        introductionField = ""
+                    else:
+                        introductionField = xmltree.getElementsByTagName('introduction')[0].firstChild.nodeValue
+                
+            f.close()
+            writer.add_document(paper=paperId, abstract=abstractField, title=titleField, introduction=introductionField)
+
+        writer.commit()
+
+def build():
+    indexbuilder = IndexBuilder()
+    indexbuilder.build()
+
+if __name__== "__main__":
+    build()
diff --git a/README.md b/README.md
index cff810345502c30c3cae93b0b14d430889945a1a..94f64c56b7395d06e4c1ede621b83350440e0104 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,38 @@
 # cs510-project
 
+Before running the software, here are a few points about the data files:
+1. We have not provided the dataset and the corresponding data files explicitly due to size constraints.
+2. The dataset (academic papers) needs to be placed in a folder `papers_to_index`.
+3. Each document is expected to be an XML file with the following fields: Paper, Abstract, Title, Introduction (a sample document is sketched after this list).
+4. Some other files are expected to be present before the code can run successfully:
+5. `docs.json`: a JSON file consisting of all the documents. Each line represents one document and contains, at the least, keyPhrases, paperAbstract, title, introduction, docno (document number), numKeyReferences (number of key references), numCitedBy and numKeyCitations. These fields are used for generating the features for training our neural network.
+6. `train_queries.json`: each line is of the form `{"qid": "the query id", "query": "the query string", "ana": {the annotated entity id and frequency}}`.
+7. `train_queries_qrel`: relevance judgements for the training queries.
+8. The above 3 files are from Freebase and are similar to the files used in the search engine assignment.
+9. `supervisedTrain.txt`: the file generated on training the neural net with the training data; it contains the feature values.
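+
+A hypothetical sketch of one document, based on how `IndexBuilder.py` parses each line as a single XML element (the text is made up for illustration):
+
+```xml
+<title>An Example Paper Title</title>
+<abstract>An example abstract.</abstract>
+<introduction>An example introduction.</introduction>
+```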
+
+The code is provided at the GitHub link given at the beginning of this report.
+The UI can be viewed directly by accessing the link at the beginning of the report.
+To run the software, use the following steps (assuming the data is placed as explained above):
+1. To build the index for the dataset, run `python3 IndexBuilder.py`. The index is built from the documents present in `papers_to_index` in the current directory and written to a folder `index`. Create an empty folder `index` if the code does not create it automatically.
+2. Once the index is built (this might take a while), generate the features for training the neural network. Ensure that the necessary files (`docs.json`, `train_queries.json` and `train_queries_qrel`, similar to the files provided for the search engine assignment) are in place, then run `python3 train_academicdata.py`. The features are written to a file `supervisedTrain.txt`.
+3. Execute `python3 neuralnetregressor.py` to train the model on the generated features and dump it into a file `plsaNeuralNetModel.model`.
+4. After the neural net is trained, the software is ready to run. Execute `python3 SearchEngine.py`, which is currently configured to run on localhost.
+5. Open `index.html` in the browser; it is configured to connect to localhost.
+6. Search for a query. Topic labelling runs every time a query is searched, and appropriate results are returned.
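+
+For reference, a query can also be issued directly against the running server (the Flask default port 5000, as used in `index.html`):
+
+```sh
+curl -X POST http://127.0.0.1:5000/search/ \
+     -H "Content-Type: application/json" \
+     -d '{"query": "topic modeling", "page_count": 1}'
+```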
diff --git a/Search.py b/Search.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5edd738870a22355198040f73f74eb49bbad60e
--- /dev/null
+++ b/Search.py
@@ -0,0 +1,133 @@
+from whoosh.index import create_in, open_dir
+from whoosh.fields import *
+from whoosh.qparser import MultifieldParser, OrGroup, QueryParser
+from whoosh import scoring
+from joblib import load
+from label_topic import TopicLabels
+
+class Myclass:
+    def __init__(self):
+        self.abBm25 = 0.0
+        self.introBm25 = 0.0
+        self.tiBm25 = 0.0
+        self.abTf = 0.0
+        self.introTf = 0.0
+        self.tiTf = 0.0
+        self.abPl2 = 0.0
+        self.introPl2 = 0.0
+        self.tiPl2 = 0.0
+        self.abDf = 0.0
+        self.introDf = 0.0
+        self.tiDf = 0.0
+        self.abstract = ""
+        self.introduction = ""
+        self.title = ""
+        self.paper = ""
+        self.neuralScore = 0.0
+
+class Search:
+   
+    def getResultDocs(self, query):
+        docs = []
+        indexDir = open_dir("index/")
+        og = OrGroup.factory(0.9)
+        indexSearcher = indexDir.searcher()
+
+        # One query parser per indexed field
+        abQueryParser = QueryParser("abstract", indexSearcher.schema, group=og)
+        abQueryObject = abQueryParser.parse(query)
+        introQueryParser = QueryParser("introduction", indexSearcher.schema, group=og)
+        introQueryObject = introQueryParser.parse(query)
+        tiQueryParser = QueryParser("title", indexSearcher.schema, group=og)
+        tiQueryObject = tiQueryParser.parse(query)
+
+        # Extract ranking features under the four scoring models
+        features = {}
+
+        def addScores(results, attr):
+            # record each paper's score under `attr`, creating its feature
+            # holder (and storing its stored fields) on first sight
+            for result in results:
+                paper = result["paper"]
+                if paper not in features:
+                    temp = Myclass()
+                    temp.abstract = result["abstract"]
+                    temp.title = result["title"]
+                    temp.introduction = result["introduction"]
+                    features[paper] = temp
+                setattr(features[paper], attr, result.score)
+
+        # BM25 (the default Whoosh scoring)
+        addScores(indexSearcher.search(abQueryObject, limit=300), "abBm25")
+        addScores(indexSearcher.search(introQueryObject, limit=300), "introBm25")
+        addScores(indexSearcher.search(tiQueryObject, limit=300), "tiBm25")
+
+        # TF-IDF
+        idfIndexSearcher = indexDir.searcher(weighting=scoring.TF_IDF())
+        addScores(idfIndexSearcher.search(abQueryObject, limit=300), "abTf")
+        addScores(idfIndexSearcher.search(introQueryObject, limit=300), "introTf")
+        addScores(idfIndexSearcher.search(tiQueryObject, limit=300), "tiTf")
+
+        # PL2
+        plIndexSearcher = indexDir.searcher(weighting=scoring.PL2())
+        addScores(plIndexSearcher.search(abQueryObject, limit=300), "abPl2")
+        addScores(plIndexSearcher.search(introQueryObject, limit=300), "introPl2")
+        addScores(plIndexSearcher.search(tiQueryObject, limit=300), "tiPl2")
+
+        # DFree
+        dfIndexSearcher = indexDir.searcher(weighting=scoring.DFree())
+        addScores(dfIndexSearcher.search(abQueryObject, limit=300), "abDf")
+        addScores(dfIndexSearcher.search(introQueryObject, limit=300), "introDf")
+        addScores(dfIndexSearcher.search(tiQueryObject, limit=300), "tiDf")
+
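+        # Score every candidate with the trained MLP regressor; the feature
+        # vector holds the 12 retrieval scores, 4 unique-token counts
+        # (abstract, introduction, title, query) and 3 query-overlap counts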
+        model = load('plsaNeuralNetModel.model')
+        for key, val in features.items():
+            ab = set(val.abstract.split())
+            intro = set(val.introduction.split())
+            ti = set(val.title.split())
+            q = set(query.split())
+            val.neuralScore = model.predict([[val.abBm25, val.introBm25, val.tiBm25, \
+                                            val.abTf, val.introTf, val.tiTf, \
+                                            val.abPl2, val.introPl2, val.tiPl2, \
+                                            val.abDf, val.introDf, val.tiDf, \
+                                            len(ab), len(intro), len(ti), len(q), \
+                                            len(q.intersection(ab)), len(q.intersection(intro)), len(q.intersection(ti))]])[0]
+
+        results = dict(sorted(features.items(), key=lambda x: x[1].neuralScore, reverse=True)[:100])
+        
+        searchResults = list()
+
+        for key, val in results.items():
+            contents = dict()
+            contents["paper"] = key
+            contents["abstract"] = val.abstarct
+            
+            docs.append(val.abstarct)
+            contents["title"] = val.title
+            contents["introduction"] = val.introduction
+            contents["topics"] = ""
+            searchResults.append(contents)
+
+        doc_topics, labels = TopicLabels.get_topic_labels(docs,
+                                n_topics=10,
+                                n_top_words=20,
+                                preprocessing_steps=['tag'],
+                                n_cand_labels=100,
+                                label_min_df=5,
+                                label_tags=['NN,NN', 'JJ,NN'],
+                                n_labels=10,
+                                lda_random_state=12345,
+                                lda_n_iter=400)
+        
+        # map each document index to the labels of its top topic
+        topic_labels = {key: labels[val] for key, val in doc_topics.items()}
+
+        for i, result in enumerate(searchResults):
+            # TO-DO: format the output, remove stemming
+            result["topics"] = str(topic_labels[i])
+
+        return searchResults
+
diff --git a/SearchEngine.py b/SearchEngine.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fce754a04eb16f5ab221aaf88345c4b4d0c9f25
--- /dev/null
+++ b/SearchEngine.py
@@ -0,0 +1,55 @@
+from flask import Flask, jsonify, Response, send_file, request
+from flask_cors import CORS
+from Search import Search
+from SearchResult import SearchResult
+from CommonUtilities import CommonUtilities
+import json
+import logging
+
+
+app = Flask(__name__)
+CORS(app)
+logger = None
+
+
+@app.route('/')
+def index():
+    return send_file('index.html')
+
+@app.route('/search/', methods=['GET', 'POST'])
+def get_results():
+    json_data = request.get_data(as_text=True)
+    data = json.loads(json_data)
+    query = data['query']
+    search = Search()
+    results = search.getResultDocs(query)
+    queryResult = list()
+    maxResultCount = int(data['page_count']) * 10
+    for result in results[maxResultCount - 10 : maxResultCount]:
+        queryResult.append(SearchResult(result, query))
+    
+    response = Response(response=json.dumps(queryResult, default=CommonUtilities.obj_dict), status=200, mimetype='application/json')
+    response.headers.add('Access-Control-Allow-Origin', '*')
+    return response
+
+@app.route('/relevance/', methods=["POST"])
+def relevance():
+    json_data = request.get_data(as_text=True)
+    data = json.loads(json_data)
+    logger.info(json_data)
+    response = Response(response=json.dumps("OK", default=CommonUtilities.obj_dict), status=200, mimetype='application/json')
+    return response
+
+def setLogger():
+    global logger
+    logger = logging.getLogger('relevance')
+    logger.setLevel(logging.INFO)
+    ch = logging.FileHandler('Relevance.json')
+    ch.setLevel(logging.INFO)
+    formatter = logging.Formatter('%(message)s')
+    ch.setFormatter(formatter)
+    logger.addHandler(ch)
+
+if __name__ == '__main__':
+    setLogger()
+    app.run(debug=True)
\ No newline at end of file
diff --git a/SearchResult.py b/SearchResult.py
new file mode 100644
index 0000000000000000000000000000000000000000..a25e2c855a1031f107cb6368244a47850f65237a
--- /dev/null
+++ b/SearchResult.py
@@ -0,0 +1,21 @@
+class SearchResult:
+    def __init__(self, result, query):
+        self.id = 'paperid' + result['paper'][:8].replace(" ", "")
+        self.title = result['title']
+        self.abstract = self.highlightQuery(result['abstract'], query)
+        self.topics = result['topics']
+        self.url = self.getUrlFormat(result['paper'])
+    
+    def getUrlFormat(self, file):
+        return "http://www.aclweb.org/anthology/" + file[0:8] + ".pdf"
+
+    def highlightQuery(self, text, query):
+        keywords = [w.lower() for w in query.split()]
+        words = text.split()
+
+        for i, word in enumerate(words):
+            for keyword in keywords:
+                if keyword in word.lower():
+                    words[i] = '<b>' + word + '</b>'
+                    break
+        return ' '.join(words).rstrip("\"").rstrip(".") + "."
\ No newline at end of file
diff --git a/corpus_processor.py b/corpus_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..97db47b47972d8e929ad874c86cbf2eaa4cb75e8
--- /dev/null
+++ b/corpus_processor.py
@@ -0,0 +1,97 @@
+import nltk
+from toolz.functoolz import partial
+from nltk.stem.porter import PorterStemmer
+
+
+class CorpusBaseProcessor(object):
+    """
+    Class that processes a corpus
+    """
+    def transform(self, docs):
+        """
+        Parameter:
+        -----------
+        docs: list of (string|list of tokens)
+            input corpus
+        
+        Return:
+        ----------
+        list of (string|list of tokens):
+            transformed corpus
+        """
+        raise NotImplementedError
+
+
+class CorpusWordLengthFilter(CorpusBaseProcessor):
+    def __init__(self, minlen=2, maxlen=35):
+        self._min = minlen
+        self._max = maxlen
+
+    def transform(self, docs):
+        """
+        Parameters:
+        ----------
+        docs: list of list of str
+            the tokenized corpus
+        """
+        assert isinstance(docs[0], list)
+        valid_length = (lambda word:
+                        len(word) >= self._min and
+                        len(word) <= self._max)
+        filter_tokens = partial(filter, valid_length)
+        return [list(filter_tokens(doc)) for doc in docs]
+    
+
+porter_stemmer = PorterStemmer()
+
+
+class CorpusStemmer(CorpusBaseProcessor):
+    def __init__(self, stem_func=porter_stemmer.stem):
+        """
+        Parameter:
+        --------------
+        stem_func: function that accepts one token and stem it
+        """
+        self._stem_func = stem_func
+
+    def transform(self, docs):
+        """
+        Parameter:
+        -------------
+        docs: list of list of str
+            the documents
+
+        Return:
+        -------------
+        list of list of str: the stemmed corpus
+        """
+        assert isinstance(docs[0], list)
+        stem_tokens = partial(map, self._stem_func)
+        return [list(stem_tokens(doc)) for doc in docs]
+
+
+class CorpusPOSTagger(CorpusBaseProcessor):
+    def __init__(self, pos_tag_func=nltk.pos_tag):
+        """
+        Parameter:
+        --------------
+        pos_tag_func: pos_tag function that accepts list of tokens
+            and POS tag them
+        """
+        self._pos_tag_func = pos_tag_func
+
+    def transform(self, docs):
+        """
+        Parameter:
+        -------------
+        docs: list of list of str
+            the documents
+
+        Return:
+        -------------
+        list of list of str: the tagged corpus
+        """
+        assert isinstance(docs[0], list)
+        nltk.download('averaged_perceptron_tagger', quiet=True)
+        docs = [self._pos_tag_func(doc) for doc in docs]
+        return docs
diff --git a/data.py b/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..6eef8d0dcfd7419105d865177bf52466c9c81470
--- /dev/null
+++ b/data.py
@@ -0,0 +1,42 @@
+import os
+import nltk
+import itertools
+import codecs
+
+CURDIR = os.path.dirname(os.path.realpath(__file__))
+
+
+def load_line_corpus(path, tokenize=True):
+    import nltk
+    nltk.download('punkt')
+    docs = []
+    with codecs.open(path, "r", "utf8") as f:
+        for l in f:
+            if tokenize:
+                sents = nltk.sent_tokenize(l.strip().lower())
+                docs.append(list(itertools.chain(*map(
+                    nltk.word_tokenize, sents))))
+            else:
+                docs.append(l.strip())
+    return docs
+
+
+def load_nips(years=None, raw=False):
+    # load data
+    if not years:
+        years = range(2008, 2015)
+    files = ['nips-{}.dat'.format(year)
+             for year in years]
+
+    docs = []
+    for f in files:
+        docs += load_line_corpus('{}/datasets/{}'.format(CURDIR, f),
+                                 tokenize=(not raw))
+        
+    return docs                
+
+
+def load_lemur_stopwords():
+    with codecs.open(CURDIR + '/datasets/lemur-stopwords.txt', 
+                     'r', 'utf8') as f:
+        return [line.strip() for line in f]
diff --git a/index.html b/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..f2300007cf9ec643708bddbe20dbe7e5ac67ffa3
--- /dev/null
+++ b/index.html
@@ -0,0 +1,108 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<title>Search Engine for Research Papers</title>
+<meta charset="utf-8">
+<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
+<script>
+    var query
+    var page_count
+    var ip_address
+    var URL = "http://127.0.0.1:5000/"
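+    // the endpoints below are served by the Flask app in SearchEngine.py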
+    
+    function relevant(relevance, paperId) {
+        $('input[name="'+paperId+'"]').attr('disabled', 'disabled');
+        $.ajax({
+        type: 'POST',
+        url: URL + "relevance/",  //add endpoint API
+        data: JSON.stringify({query: query, relevance: relevance, id: paperId, ip_address: ip_address}),
+        });
+        
+    }
+   
+    function search(){
+       
+        $.ajax({
+            type: "POST",
+            url: URL + "search/",   
+            data: JSON.stringify({query: query, page_count: page_count}),
+            contentType: "applicaton/json; charset=utf-8",
+            success: function(results){
+                var resultLength = results.length
+                
+                for (var i=0; i < resultLength; ++i)
+                {
+                    var paperId = results[i].id
+                    $(".searchResults").append("<br>");
+                    $(".searchResults").append("<table>");
+                    $(".searchResults").append("<tr><td>");
+                    $(".searchResults").append("<p style=\"font-size:18px;\">" + "<a target=\"_blank\" href=\"" + results[i].url + "\">"+results[i].title+"</a>" + "</p>");
+                    $(".searchResults").append("</td></tr>");
+                    $(".searchResults").append("<tr><td>");
+                    $(".searchResults").append("<input type=\"radio\" name=\""+paperId+"\" onclick=\"relevant(true, name)\"><label> Relevant</label>&nbsp")
+                    $(".searchResults").append("&nbsp<input type=\"radio\" name=\""+paperId+"\" onclick=\"relevant(false, name)\"><label> Non-Relevant</label>") 
+                    $(".searchResults").append("</td></tr>");
+                    $(".searchResults").append("<tr><td>");
+                    $(".searchResults").append("<p style=\"border-style: groove;font-size:14px;\"> <b>Topic labels: </b>"+ results[i].topics +"</p>");
+                    $(".searchResults").append("</td></tr>");
+                    $(".searchResults").append("<pre style=\"white-space: pre-wrap;\">"+results[i].abstract+"</pre>")
+                    $(".searchResults").append("</td></tr>");
+                    $(".searchResults").append("</table>");
+                    
+                    
+                }
+                if(resultLength == 0) {
+                    $(".searchResults").append("<br>");
+                    $(".searchResults").append("<p style=\"font-size:18px; color:Red\"> There are no (more) results to show for the given query </p>");
+                }
+                if (resultLength < 10) 
+                    $("#more").hide();
+                else 
+                    $("#more").show();
+    
+            }
+        });
+        
+        function callback(response){
+            console.log(response);
+        }
+    }
+    $(document).ready(function(){
+            $("#searchButton").click(function(){
+                    $(".searchResults").empty()
+                    page_count = 1
+                    query = document.getElementById("querytext").value;
+                    search();
+            });
+            $("#more").click(function(){
+                    page_count++
+                    query = document.getElementById("querytext").value;
+                    search();
+            });
+
+            $.getJSON('https://api.ipify.org?format=json', function(data){
+                ip_address = data.ip;
+        });
+        
+           
+    });
+</script>
+
+</head>
+
+<body>
+<center>
+<h2>Find Research Papers</h2>
+
+<input id="querytext" type="text" placeholder="Search.." size=80/>
+
+<button type="button" id="searchButton"> Search </button> 
+
+<div class="searchResults"> </div>
+<br><br>
+<button type="button" id="more" style="display:none;"><b>More results</b></button>
+</center>
+</body>
+</html> 
diff --git a/label_finder.py b/label_finder.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd2a6643496cb27b46705fab7271ebebb0d90119
--- /dev/null
+++ b/label_finder.py
@@ -0,0 +1,81 @@
+import nltk
+from nltk.collocations import BigramCollocationFinder
+from toolz.itertoolz import get
+from toolz.functoolz import partial
+
+
+class BigramLabelFinder(object):
+    def __init__(self, measure='pmi',
+                 min_freq=20,
+                 pos=[('NN', 'NN'), ('JJ', 'NN')]):
+        """
+        measure: str
+            the measurement method, 'pmi' or 'chi_sq'
+
+        min_freq: int
+            minimal frequency for the label to be considered
+
+        pos: list of (str, str)
+            the POS tag constraint
+        """
+        self.bigram_measures = nltk.collocations.BigramAssocMeasures()
+        assert measure in ('pmi', 'chi_sq')
+        self._measure_method = measure
+
+        self._min_freq = min_freq
+        self._pos = pos
+        
+    def find(self, docs, top_n, strip_tags=True):
+        """
+        Parameter:
+        ---------------
+
+        docs: list of tokenized documents
+            
+        top_n: int
+            how many labels to return
+
+        strip_tags: bool
+            whether return without the POS tags or not
+
+        Return:
+        ---------------
+        list of tuple of str: the bigrams
+        """
+        # if apply pos constraints
+        # check the pos properties
+        if self._pos:
+            assert isinstance(self._pos, list)
+            for pair in self._pos:
+                assert isinstance(pair, tuple) or isinstance(pair, list)
+                assert len(pair) == 2  # because it's bigram
+
+        score_func = getattr(self.bigram_measures,
+                             self._measure_method)
+
+        finder = BigramCollocationFinder.from_documents(docs)
+        finder.apply_freq_filter(self._min_freq)
+
+        if self._pos:
+            valid_pos_tags = set(tuple(pair) for pair in self._pos)
+            valid_bigrams = []
+            bigrams = map(partial(get, 0),  # get the bigram
+                          finder.score_ngrams(score_func))
+            cnt = 0
+            for bigram in bigrams:
+                if tuple(map(partial(get, 1), bigram)) in valid_pos_tags:
+                    valid_bigrams.append(bigram)
+                    cnt += 1
+                if cnt == top_n:  # enough
+                    break
+
+            if strip_tags:
+                valid_bigrams = [tuple(map(partial(get, 0), bigram))
+                                 for bigram in valid_bigrams]
+
+            return valid_bigrams
+        else:
+            bigrams = finder.nbest(score_func,
+                                   top_n)
+            return bigrams
+            
diff --git a/label_ranker.py b/label_ranker.py
new file mode 100644
index 0000000000000000000000000000000000000000..e61664ad7647792a9e5e539a955a158e2b3c38ef
--- /dev/null
+++ b/label_ranker.py
@@ -0,0 +1,235 @@
+"""
+Reference:
+---------------------
+
+Qiaozhu Mei, Xuehua Shen, Chengxiang Zhai,
+Automatic Labeling of Multinomial Topic Models, 2007
+"""
+import numpy as np
+from scipy.stats import entropy as kl_divergence
+
+
+class LabelRanker(object):
+    """
+    
+    """
+    def __init__(self,
+                 apply_intra_topic_coverage=True,
+                 apply_inter_topic_discrimination=True,
+                 mu=0.7,
+                 alpha=0.9):
+        self._coverage = apply_intra_topic_coverage
+        self._discrimination = apply_inter_topic_discrimination
+        self._mu = mu
+        self._alpha = alpha
+
+    def label_relevance_score(self,
+                              topic_models,
+                              pmi_w2l):
+        """
+        Calculate the relevance scores between each label and each topic
+
+        Parameters:
+        ---------------
+        topic_models: numpy.ndarray(#topics, #words)
+           the topic models
+
+        pmi_w2l: numpy.ndarray(#words, #labels)
+           the Point-wise Mutual Information(PMI) table of
+           the form, PMI(w, l | C)
+        
+        Returns:
+        -------------
+        numpy.ndarray, shape (#topics, #labels)
+            the scores of each label on each topic
+        """
+        assert topic_models.shape[1] == pmi_w2l.shape[0]
+        return np.asarray(np.asmatrix(topic_models) *
+                          np.asmatrix(pmi_w2l))
+        
+    def label_discriminative_score(self,
+                                   relevance_score,
+                                   topic_models,
+                                   pmi_w2l):
+        """
+        Calculate the discriminative scores for each label
+        
+        Returns:
+        --------------
+        numpy.ndarray, shape (#topics, #labels)
+            the (i, j)th element denotes the score
+            for label j and all topics *except* the ith
+        """
+        assert topic_models.shape[1] == pmi_w2l.shape[0]
+        k = topic_models.shape[0]
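+        # a label's discriminative score for topic i is its average relevance
+        # to all the other topics: (column sum - own score) / (k - 1)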
+        return (relevance_score.sum(axis=0)[None, :].repeat(repeats=k, axis=0)
+                - relevance_score) / (k-1)
+        
+    def label_mmr_score(self,
+                        which_topic,
+                        chosen_labels,
+                        label_scores,
+                        label_models):
+        """
+        Maximal Marginal Relevance score for labels.
+        It's computed only when `apply_intra_topic_coverage` is True
+
+        Parameters:
+        --------------
+        which_topic: int
+            the index of the topic
+        
+        chosen_labels: list<int>
+           indices of labels that are already chosen
+        
+        label_scores: numpy.ndarray<#topic, #label>
+           label scores for each topic
+
+        label_models: numpy.ndarray<#label, #words>
+            the language models for labels
+
+        Returns:
+        --------------
+        numpy.ndarray: 1D of length #label - #chosen_labels
+            the scored label indices
+
+        numpy.ndarray: same length as above
+            the scores
+        """
+        chosen_len = len(chosen_labels)
+        if chosen_len == 0:
+            # no label is chosen
+            # return the raw scores
+            return (np.arange(label_models.shape[0]),
+                    label_scores[which_topic, :])
+        else:
+            kl_m = np.zeros((label_models.shape[0]-chosen_len,
+                             chosen_len))
+            
+            # the unchosen label indices
+            candidate_labels = list(set(range(label_models.shape[0])) -
+                                    set(chosen_labels))
+            candidate_labels = np.sort(np.asarray(candidate_labels))
+            for i, l_p in enumerate(candidate_labels):
+                for j, l in enumerate(chosen_labels):
+                    kl_m[i, j] = kl_divergence(label_models[l_p],
+                                               label_models[l])
+            sim_scores = kl_m.max(axis=1)
+            mmr_scores = (self._alpha *
+                          label_scores[which_topic, candidate_labels]
+                          - (1 - self._alpha) * sim_scores)
+            return (candidate_labels, mmr_scores)
+
+    def combined_label_score(self, topic_models, pmi_w2l,
+                             use_discrimination, mu=None):
+        """
+        Calculate the combined scores from relevance_score
+        and discrimination_score(if required)
+
+        Parameter:
+        -----------
+        use_discrimination: bool
+            whether use discrimination or not
+        mu: float
+            the `mu` parameter in the algorithm
+
+        Return:
+        -----------
+        numpy.ndarray, shape (#topics, #labels)
+            score for each topic and label pair
+        """
+        rel_scores = self.label_relevance_score(topic_models, pmi_w2l)
+        
+        if use_discrimination:
+            assert mu is not None
+            discrim_scores = self.label_discriminative_score(rel_scores,
+                                                             topic_models,
+                                                             pmi_w2l)
+            label_scores = rel_scores - mu * discrim_scores
+        else:
+            label_scores = rel_scores
+
+        return label_scores
+
+    def select_label_sequentially(self, k_labels,
+                                  label_scores, label_models):
+        """
+        Return:
+        ------------
+        list<list<int>>: shape n_topics x k_labels
+        """
+        n_topics = label_scores.shape[0]
+        chosen_labels = []
+
+        # don't use [[]] * n_topics !
+        for _ in range(n_topics):
+            chosen_labels.append(list())
+            
+        for i in range(n_topics):
+            for j in range(k_labels):
+                inds, scores = self.label_mmr_score(i, chosen_labels[i],
+                                                    label_scores,
+                                                    label_models)
+                chosen_labels[i].append(inds[np.argmax(scores)])
+        return chosen_labels
+
+    def top_k_labels(self,
+                     topic_models,
+                     pmi_w2l,
+                     index2label,
+                     label_models=None,
+                     k=5):
+        """
+        Parameters:
+        ----------------
+        
+        index2label: dict<int, object>
+           mapping from label index in the `pmi_w2l`
+           to the label object, which can be string
+
+        label_models: numpy.ndarray<#label, #words>
+            the language models for labels
+            if `apply_intra_topic_coverage` is True,
+            then it must be given
+
+        Return:
+        ---------------
+        list<list of (label, float)>
+           top k labels as well as scores for each topic model
+
+        """
+
+        assert pmi_w2l.shape[1] == len(index2label)
+
+        label_scores = self.combined_label_score(topic_models, pmi_w2l,
+                                                 self._discrimination,
+                                                 self._mu)
+
+        if self._coverage:
+            assert isinstance(label_models, np.ndarray)
+            # TODO: can be parallel
+            chosen_labels = self.select_label_sequentially(k, label_scores,
+                                                           label_models)
+        else:
+            chosen_labels = np.argsort(label_scores, axis=1)[:, :-k-1:-1]
+        return [[index2label[j]
+                 for j in topic_i_labels]
+                for topic_i_labels in chosen_labels]
+            
+    def print_top_k_labels(self, topic_models, pmi_w2l,
+                           index2label, label_models, k):
+        res = u"Topic labels:\n"
+        for i, labels in enumerate(self.top_k_labels(
+                topic_models=topic_models,
+                pmi_w2l=pmi_w2l,
+                index2label=index2label,
+                label_models=label_models,
+                k=k)):
+            res += u"Topic {}: {}\n".format(
+                i,
+                ', '.join(map(lambda l: ' '.join(l),
+                              labels))
+            )
+        return res
diff --git a/label_topic.py b/label_topic.py
new file mode 100644
index 0000000000000000000000000000000000000000..9692fa4d65e68b288c73c2dcc9cd984cf9544629
--- /dev/null
+++ b/label_topic.py
@@ -0,0 +1,129 @@
+import lda
+import itertools
+import numpy as np
+import nltk
+from sklearn.feature_extraction.text import (CountVectorizer
+                                             as WordCountVectorizer)
+from text import LabelCountVectorizer
+from label_finder import BigramLabelFinder
+from label_ranker import LabelRanker
+from pmi import PMICalculator
+from corpus_processor import (CorpusWordLengthFilter,
+                                       CorpusPOSTagger,
+                                       CorpusStemmer)
+from data import (load_line_corpus, load_lemur_stopwords)
+
+class TopicLabels:
+    @staticmethod
+    def get_topic_labels(doc,
+                        n_topics,
+                        n_top_words,
+                        preprocessing_steps,
+                        n_cand_labels, label_min_df,
+                        label_tags, n_labels,
+                        lda_random_state,
+                        lda_n_iter):
+        """
+        Refer the arguments to `create_parser`
+        """
+
+        docs = []
+        for d in doc:
+            sents = nltk.sent_tokenize(d.strip().lower())
+            docs.append(list(itertools.chain(*map(nltk.word_tokenize, sents))))
+        if 'wordlen' in preprocessing_steps:
+            print("Word length filtering...")
+            wl_filter = CorpusWordLengthFilter(minlen=3)
+            docs = wl_filter.transform(docs)
+
+        if 'stem' in preprocessing_steps:
+            print("Stemming...")
+            stemmer = CorpusStemmer()
+            docs = stemmer.transform(docs)
+
+        if 'tag' in preprocessing_steps:
+            print("POS tagging...")
+            tagger = CorpusPOSTagger()
+            tagged_docs = tagger.transform(docs)
+
+        tag_constraints = []
+        if label_tags != ['None']:
+            for tags in label_tags:
+                tag_constraints.append(tuple(map(lambda t: t.strip(),
+                                                tags.split(','))))
+
+        if len(tag_constraints) == 0:
+            tag_constraints = None
+
+        print("Tag constraints: {}".format(tag_constraints))
+
+        print("Generate candidate bigram labels(with POS filtering)...")
+        finder = BigramLabelFinder('pmi', min_freq=label_min_df,
+                                pos=tag_constraints)
+        if tag_constraints:
+            assert 'tag' in preprocessing_steps, \
+                "if tag constraints are applied, POS tagging ('tag') must be included in the preprocessing steps"
+            cand_labels = finder.find(tagged_docs, top_n=n_cand_labels)
+        else:  # if no constraint, then use untagged docs
+            cand_labels = finder.find(docs, top_n=n_cand_labels)
+
+        print("Collected {} candidate labels".format(len(cand_labels)))
+
+        print("Calculate the PMI scores...")
+
+        pmi_cal = PMICalculator(
+            doc2word_vectorizer=WordCountVectorizer(
+                min_df=5,
+                stop_words=load_lemur_stopwords()),
+            doc2label_vectorizer=LabelCountVectorizer())
+
+        pmi_w2l = pmi_cal.from_texts(docs, cand_labels)
+
+        print("Topic modeling using LDA...")
+        model = lda.LDA(n_topics=n_topics, n_iter=lda_n_iter,
+                        random_state=lda_random_state)
+        model.fit(pmi_cal.d2w_)
+
+        print("\nDocument coverage:")
+        doc_topic = model.doc_topic_
+        doc_topics = {}
+        for i in range(doc_topic.shape[0]):
+            doc_topics[i] = doc_topic[i].argmax()
+            # print("{} (top topic: {})".format(i, doc_topic[i].argmax()))
+
+        print("\nTopical words:")
+        print("-" * 20)
+        for i, topic_dist in enumerate(model.topic_word_):
+            top_word_ids = np.argsort(topic_dist)[:-n_top_words:-1]
+            topic_words = [pmi_cal.index2word_[id_]
+                        for id_ in top_word_ids]
+            # print('Topic {}: {}'.format(i, ' '.join(topic_words)))
+
+        ranker = LabelRanker(apply_intra_topic_coverage=False)
+
+        return doc_topics, ranker.top_k_labels(topic_models=model.topic_word_,
+                                pmi_w2l=pmi_w2l,
+                                index2label=pmi_cal.index2label_,
+                                label_models=None,
+                                k=n_labels)
+        
+    # if __name__ == '__main__':
+        # labels = get_topic_labels(docs = [],
+        #                         n_topics=10,
+        #                         n_top_words=20,
+        #                         preprocessing_steps=['wordlen', 'stem', 'tag'],
+        #                         n_cand_labels=100,
+        #                         label_min_df=5,
+        #                         label_tags=['NN,NN', 'JJ,NN'],
+        #                         n_labels=10,
+        #                         lda_random_state=12345,
+        #                         lda_n_iter=400)
+        
+        # print("\nTopical labels:")
+        # print("-" * 20)
+        # for i, labels in enumerate(labels):
+        #     print(u"Topic {}: {}\n".format(
+        #         i,
+        #         ', '.join(map(lambda l: ' '.join(l), labels))
+        #     ))
+
diff --git a/neuralnetregressor.py b/neuralnetregressor.py
new file mode 100644
index 0000000000000000000000000000000000000000..66449cfc06255ade9bde8987ef3c29045cd333f5
--- /dev/null
+++ b/neuralnetregressor.py
@@ -0,0 +1,71 @@
+from sklearn.neural_network import MLPRegressor
+from sklearn import preprocessing as pre
+import numpy as np
+from collections import defaultdict
+
+def relevanceScore(intercept, coefs, scores):
+    relScore = intercept
+    for index, score in enumerate(scores):
+        relScore += (float(score) * coefs[index])
+    return relScore
+
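+# supervisedTrain.txt is comma-separated; column 0 holds the relevance label
+# and the following columns hold the feature values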
+trainData = list()
+f = open("./supervisedTrain.txt", "r")
+for line in f:
+    words = line.split(",")
+    trainData.append(words)
+
+trainData = np.array(trainData)
+trainRel = np.array(trainData[:,0], dtype='float')
+trainFeatures = np.array(trainData[:,1:-1], dtype='float')
+
+scaler = pre.StandardScaler()
+trainFeaturesScaled = scaler.fit_transform(trainFeatures)
+
+testData = list()
+tf = open("./neuralNetFeaturesTest.txt", "r")
+for line in tf:
+    words = line.split(",")
+    testData.append(words)
+
+# Train the model; two hidden layers of 3 units each gave good results on the regression data
+mlp = MLPRegressor(hidden_layer_sizes=(3, 3),
+                    activation='tanh',
+                    solver='adam',
+                    learning_rate='invscaling',
+                    max_iter=1000,
+                    learning_rate_init=0.001,
+                    alpha=0.001,
+                    random_state=0,
+                    shuffle=True)
+mlp.fit(trainFeaturesScaled, trainRel)
+
+testData = np.array(testData)
+testDataQid = testData[:,0]
+testDataDocNo = testData[:,-1]
+testDataFeatures = np.array(testData[:,1:-1], dtype='float')
+testDataFeaturesScaled = scaler.transform(testDataFeatures)
+testDataRel = mlp.predict(testDataFeaturesScaled)
+
+
+testDict = defaultdict(list)
+for index, rel in enumerate(testDataRel):
+    temp = [rel, testDataDocNo[index]]
+    testDict[testDataQid[index]].append(temp)
+
+tf = open("./NeuralNetRegressionFeaturesTrainResults.txt", "w")
+finalDict = defaultdict(list)
+for key, value in testDict.items():
+    value.sort(key=lambda x:x[0], reverse=True)
+    finalDict[key] = value[:100]
+
+for key, value in finalDict.items():
+    for v in value:
+        tf.write(key + "\t" + v[1][:-1] + "\t" + str(v[0]) + "\n")
+tf.close()
+
+# TODO Add evaluation methods to be invoked on test data set results.
+from joblib import dump
+dump(mlp, "plsaNeuralNetModel.model")
\ No newline at end of file
diff --git a/passenger_wsgi.py b/passenger_wsgi.py
new file mode 100644
index 0000000000000000000000000000000000000000..8056eb0af381397ee9d3983ac29ffdca093758b6
--- /dev/null
+++ b/passenger_wsgi.py
@@ -0,0 +1 @@
+# from SearchEngine import app as application
\ No newline at end of file
diff --git a/plsaNeuralNetModel.model b/plsaNeuralNetModel.model
new file mode 100644
index 0000000000000000000000000000000000000000..555fa1568b3b1386dc40ce22e22c76d0a153b8d0
Binary files /dev/null and b/plsaNeuralNetModel.model differ
diff --git a/pmi.py b/pmi.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7cd6585d49accd1ab61d082c9bb061144614e91
--- /dev/null
+++ b/pmi.py
@@ -0,0 +1,113 @@
+import numpy as np
+from scipy.sparse import issparse
+
+import logging
+logging.basicConfig(level=logging.DEBUG)
+
+
+class PMICalculator(object):
+    """
+    Parameter:
+    -----------
+    doc2word_vectorizer: object that turns list of text into doc2word matrix
+        for example, sklearn.feature_extraction.text.CountVectorizer
+    """
+    def __init__(self, doc2word_vectorizer=None,
+                 doc2label_vectorizer=None):
+        self._d2w_vect = doc2word_vectorizer
+        self._d2l_vect = doc2label_vectorizer
+
+        self.index2word_ = None
+        self.index2label_ = None
+        
+    def from_matrices(self, d2w, d2l, pseudo_count=1):
+        """
+        Parameter:
+        ------------
+        d2w: numpy.ndarray or scipy.sparse.csr_matrix
+            document-word frequency matrix
+        
+        d2l: numpy.ndarray or scipy.sparse.csr_matrix
+            document-label frequency matrix
+            type should be the same with `d2w`
+
+        pseudo_count: float
+            smoothing parameter to avoid division by zero
+
+        Return:
+        ------------
+        numpy.ndarray: #word x #label
+            the pmi matrix
+        """        
+        denom1 = d2w.T.sum(axis=1)
+        denom2 = d2l.sum(axis=0)
+
+        # both are dense
+        if (not issparse(d2w)) and (not issparse(d2l)):
+            numer = np.matrix(d2w.T > 0) * np.matrix(d2l > 0)
+            denom1 = denom1[:, None]
+            denom2 = denom2[None, :]
+        # both are sparse
+        elif issparse(d2w) and issparse(d2l):
+            numer = ((d2w.T > 0) * (d2l > 0)).todense()
+        else:
+            raise TypeError(('Type inconsistency: {} and {}.\n'
+                             'They should be the same.').format(
+                                 type(d2w), type(d2l)))
+
+        # dtype conversion
+        numer = np.asarray(numer, dtype=np.float64)
+        denom1 = np.asarray(
+            denom1.repeat(repeats=d2l.shape[1], axis=1),
+            dtype=np.float64)
+        denom2 = np.asarray(
+            denom2.repeat(repeats=d2w.shape[1], axis=0),
+            dtype=np.float64)
+
+        # smoothing
+        numer += pseudo_count
+
+        return np.log(d2w.shape[0] * numer / denom1 / denom2)
+
+    def from_texts(self, docs, labels):
+        """
+        Parameter:
+        -----------
+        docs: list of list of string
+            the tokenized documents
+
+        labels: list of list of string
+        
+        Return:
+        -----------
+        numpy.ndarray: #word x #label
+            the pmi matrix
+        """
+        d2w = self._d2w_vect.fit_transform(map(lambda sent: ' '.join(sent),
+                                               docs))
+
+        # save it to avoid re-computation
+        self.d2w_ = d2w
+
+        d2l = self._d2l_vect.transform(docs, labels)
+
+        # remove the labels without any occurrences
+        indices = np.asarray(d2l.sum(axis=0).nonzero()[1]).flatten()
+        d2l = d2l[:, indices]
+
+        indices = set(indices)
+        labels = [l
+                  for i, l in self._d2l_vect.index2label_.items()
+                  if i in indices]
+
+        self.index2label_ = {i: l
+                             for i, l in enumerate(labels)}
+
+        if len(self.index2label_) == 0:
+            logging.warning("After label filtering, there is nothing left.")
+
+        self.index2word_ = {i: w
+                            for w, i in self._d2w_vect.vocabulary_.items()}
+        return self.from_matrices(d2w, d2l)
diff --git a/text.py b/text.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5a277f68739809e4c824fb72ab07c6d5bb91fc2
--- /dev/null
+++ b/text.py
@@ -0,0 +1,74 @@
+from scipy.sparse import (csr_matrix, lil_matrix)
+from numpy import int64
+
+
+class LabelCountVectorizer(object):
+    """
+    Count the frequency of labels in each document
+    """
+    
+    def __init__(self):
+        self.index2label_ = None
+        
+    def _label_frequency(self, label_tokens, context_tokens):
+        """
+        Calculate the frequency that the label appears
+        in the context(e.g, sentence)
+        
+        Parameter:
+        ---------------
+
+        label_tokens: list|tuple of str
+            the label tokens
+        context_tokens: list|tuple of str
+            the sentence tokens
+
+        Return:
+        -----------
+        int: the label frequency in the sentence
+        """
+        label_len = len(label_tokens)
+        cnt = 0
+        for i in range(len(context_tokens) - label_len + 1):
+            match = True
+            for j in range(label_len):
+                if label_tokens[j] != context_tokens[i+j]:
+                    match = False
+                    break
+            if match:
+                cnt += 1
+        return cnt
+
+    def transform(self, docs, labels):
+        """
+        Calculate the doc2label frequency table
+
+        Note: the frequency is computed by exact matching of the label's
+            token sequence against each document's token sequence
+        
+        Parameter:
+        ------------
+
+        docs: list of list of string
+            tokenized documents
+
+        labels: list of list of string
+
+        Return:
+        -----------
+        scipy.sparse.csr_matrix: #doc x #label
+            the frequency table
+        """
+        labels = sorted(labels)
+        self.index2label_ = {index: label
+                             for index, label in enumerate(labels)}
+
+        ret = lil_matrix((len(docs), len(labels)),
+                         dtype=int64)
+        for i, d in enumerate(docs):
+            for j, l in enumerate(labels):
+                cnt = self._label_frequency(l, d)
+                if cnt > 0:
+                    ret[i, j] = cnt
+        return ret.tocsr()
+
diff --git a/train_academicdata.py b/train_academicdata.py
new file mode 100644
index 0000000000000000000000000000000000000000..a50ae679137af506b4be6f2c8388f9e12df6e5ea
--- /dev/null
+++ b/train_academicdata.py
@@ -0,0 +1,287 @@
+import json
+import xml.etree.ElementTree as ET
+from whoosh.index import create_in, open_dir
+from whoosh.fields import *
+from whoosh.qparser import MultifieldParser, OrGroup, QueryParser
+from whoosh.scoring import BaseScorer
+from whoosh import scoring
+from collections import defaultdict
+
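+# Feature holder for one (query, document) pair: BM25 / TF-IDF / PL2 / DFree
+# scores over the keyPhrases (kp), paperAbstract (pa) and title (ti) fields,
+# plus citation-based counts and the relevance label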
+class Myclass:
+    def __init__(self):
+        self.rel = 0
+        self.kpbm25 = 0.0
+        self.pabm25 = 0.0
+        self.tibm25 = 0.0
+        self.kptf = 0.0
+        self.patf = 0.0
+        self.titf = 0.0
+        self.kppl2 = 0.0
+        self.papl2 = 0.0
+        self.tipl2 = 0.0
+        self.kpdf = 0.0
+        self.padf = 0.0
+        self.tidf = 0.0
+        self.references = 0
+        self.citatedBy = 0
+        self.citations = 0
+
+
+count = 0
+
+rf = open("./supervisedTrain.txt", "w")
+f = open("./train_queries.json", encoding = "utf-8")
+relevance = open("./train_queries_qrel", encoding = "utf-8")
+train_queries = defaultdict(dict)
+for line in relevance:
+    words = line.split()
+    train_queries[words[0]][words[1]] = words[2]
+
+
+fileWrite = defaultdict(dict)
+queries = defaultdict(str)
+for line in f:
+        document = json.loads(line)
+        query = document["query"]
+        query = query.replace("#combine( ", '')
+        query = query.replace(')', '')
+        queries[document["qid"]] = query
+        indexDir = open_dir("index/")
+        og = OrGroup.factory(0.9)
+        #QueryParser("content", schema)
+        #queryParser = MultifieldParser(["keyPhrases", "paperAbstract", "title"], indexSearcher.schema, group=og)
+        indexSearcher = indexDir.searcher()
+        kpqueryParser = QueryParser("keyPhrases", indexSearcher.schema, group=og)
+        kpqueryObject = kpqueryParser.parse(query)
+        paqueryParser = QueryParser("paperAbstract", indexSearcher.schema, group=og)
+        paqueryObject = paqueryParser.parse(query)
+        tiqueryParser = QueryParser("title", indexSearcher.schema, group=og)
+        tiqueryObject = tiqueryParser.parse(query)
+
+        
+        kpresults = indexSearcher.search(kpqueryObject, limit = None)
+        paresults = indexSearcher.search(paqueryObject, limit = None)
+        tiresults = indexSearcher.search(tiqueryObject, limit = None)
+        for result in kpresults:
+            docNo = "".join(result["docno"])
+            rel = train_queries[str(count)].get(docNo, -1)
+
+            if rel != -1:
+                if (docNo in fileWrite[str(count)]):
+                    fileWrite[str(count)][docNo].kpbm25 = result.score
+                else:
+                    temp = Myclass()
+                    temp.rel = rel
+                    temp.kpbm25 = result.score
+                    fileWrite[str(count)][docNo] = temp
+        
+        for result in paresults:
+            docNo = "".join(result["docno"])
+            rel = train_queries[str(count)].get(docNo, -1)
+
+            if rel != -1:
+                if (docNo in fileWrite[str(count)]):
+                    fileWrite[str(count)][docNo].pabm25 = result.score
+                else:
+                    temp = Myclass()
+                    temp.rel = rel
+                    temp.pabm25 = result.score
+                    fileWrite[str(count)][docNo] = temp
+
+        for result in tiresults:
+            docNo = "".join(result["docno"])
+            rel = train_queries[str(count)].get(docNo, -1)
+
+            if rel != -1:
+                if (docNo in fileWrite[str(count)]):
+                    fileWrite[str(count)][docNo].tibm25 = result.score
+                else:
+                    temp = Myclass()
+                    temp.rel = rel
+                    temp.tibm25 = result.score
+                    fileWrite[str(count)][docNo] = temp
+        
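+        # The twelve near-identical result loops in this file could be
+        # collapsed into one helper -- a minimal sketch, not used by this
+        # script (names are illustrative):
+        #
+        #     def record(results, attr):
+        #         for result in results:
+        #             docNo = "".join(result["docno"])
+        #             rel = train_queries[str(count)].get(docNo, -1)
+        #             if rel != -1:
+        #                 entry = fileWrite[str(count)].setdefault(docNo, Myclass())
+        #                 entry.rel = rel
+        #                 setattr(entry, attr, result.score)
+        #
+        #     record(kpresults, "kpbm25")
+        #     record(paresults, "pabm25")
+        #     record(tiresults, "tibm25")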
+        
+        # TF-IDF weighting per field
+        w = scoring.TF_IDF()
+        idfIndexSearcher = indexDir.searcher(weighting=w)
+        kpidfResults = idfIndexSearcher.search(kpqueryObject, limit = None)
+        paidfResults = idfIndexSearcher.search(paqueryObject, limit = None)
+        tiidfResults = idfIndexSearcher.search(tiqueryObject, limit = None)
+        for result in kpidfResults:
+            docNo = "".join(result["docno"])
+            rel = train_queries[str(count)].get(docNo, -1)
+
+            if rel != -1:
+                if (docNo in fileWrite[str(count)]):
+                    fileWrite[str(count)][docNo].kptf = result.score
+                else:
+                    temp = Myclass()
+                    temp.rel = rel
+                    temp.kptf = result.score
+                    fileWrite[str(count)][docNo] = temp
+        
+        for result in paidfResults:
+            docNo = "".join(result["docno"])
+            rel = train_queries[str(count)].get(docNo, -1)
+
+            if rel != -1:
+                if (docNo in fileWrite[str(count)]):
+                    fileWrite[str(count)][docNo].patf = result.score
+                else:
+                    temp = Myclass()
+                    temp.rel = rel
+                    temp.patf = result.score
+                    fileWrite[str(count)][docNo] = temp
+
+        for result in tiidfResults:
+            docNo = "".join(result["docno"])
+            rel = train_queries[str(count)].get(docNo, -1)
+
+            if rel != -1:
+                if (docNo in fileWrite[str(count)]):
+                    fileWrite[str(count)][docNo].titf = result.score
+                else:
+                    temp = Myclass()
+                    temp.rel = rel
+                    temp.titf = result.score
+                    fileWrite[str(count)][docNo] = temp
+
+        # PL2 (divergence-from-randomness) weighting per field
+        w = scoring.PL2()
+        plIndexSearcher = indexDir.searcher(weighting=w)
+        kpplResults = plIndexSearcher.search(kpqueryObject, limit = None)
+        paplResults = plIndexSearcher.search(paqueryObject, limit = None)
+        tiplResults = plIndexSearcher.search(tiqueryObject, limit = None)
+
+        for result in kpplResults:
+            docNo = "".join(result["docno"])
+            rel = train_queries[str(count)].get(docNo, -1)
+
+            if rel != -1:
+                if (docNo in fileWrite[str(count)]):
+                    fileWrite[str(count)][docNo].kppl2 = result.score
+                else:
+                    temp = Myclass()
+                    temp.rel = rel
+                    temp.kppl2 = result.score
+                    fileWrite[str(count)][docNo] = temp
+        
+        for result in paplResults:
+            docNo = "".join(result["docno"])
+            rel = train_queries[str(count)].get(docNo, -1)
+
+            if rel != -1:
+                if (docNo in fileWrite[str(count)]):
+                    fileWrite[str(count)][docNo].papl2 = result.score
+                else:
+                    temp = Myclass()
+                    temp.rel = rel
+                    temp.papl2 = result.score
+                    fileWrite[str(count)][docNo] = temp
+
+        for result in tiplResults:
+            docNo = "".join(result["docno"])
+            rel = train_queries[str(count)].get(docNo, -1)
+
+            if rel != -1:
+                if (docNo in fileWrite[str(count)]):
+                    fileWrite[str(count)][docNo].tipl2 = result.score
+                else:
+                    temp = Myclass()
+                    temp.rel = rel
+                    temp.tipl2 = result.score
+                    fileWrite[str(count)][docNo] = temp 
+
+        # DFree (parameter-free divergence-from-randomness) weighting per field
+        w = scoring.DFree()
+        dfIndexSearcher = indexDir.searcher(weighting=w)
+        kpdfResults = dfIndexSearcher.search(kpqueryObject, limit = None)
+        padfResults = dfIndexSearcher.search(paqueryObject, limit = None)
+        tidfResults = dfIndexSearcher.search(tiqueryObject, limit = None)
+
+        for result in kpdfResults:
+            docNo = "".join(result["docno"])
+            rel = train_queries[str(count)].get(docNo, -1)
+
+            if rel != -1:
+                if (docNo in fileWrite[str(count)]):
+                    fileWrite[str(count)][docNo].kpdf = result.score
+                else:
+                    temp = Myclass()
+                    temp.rel = rel
+                    temp.kpdf = result.score
+                    fileWrite[str(count)][docNo] = temp
+        
+        for result in padfResults:
+            docNo = "".join(result["docno"])
+            rel = train_queries[str(count)].get(docNo, -1)
+
+            if rel != -1:
+                if (docNo in fileWrite[str(count)]):
+                    fileWrite[str(count)][docNo].padf = result.score
+                else:
+                    temp = Myclass()
+                    temp.rel = rel
+                    temp.padf = result.score
+                    fileWrite[str(count)][docNo] = temp
+
+        for result in tidfResults:
+            docNo = "".join(result["docno"])
+            rel = train_queries[str(count)].get(docNo, -1)
+
+            if rel != -1:
+                if (docNo in fileWrite[str(count)]):
+                    fileWrite[str(count)][docNo].tidf = result.score
+                else:
+                    temp = Myclass()
+                    temp.rel = rel
+                    temp.tidf = result.score
+                    fileWrite[str(count)][docNo] = temp
+
+        count += 1
+
+def getJsonValue(jsonValue, key):
+    """Fetch key from a parsed JSON document, joining list-valued fields into
+    a single string; return " " when the key is missing or malformed."""
+    try:
+        if key != "docno":
+            value = " ".join(jsonValue[key])
+        else:
+            value = jsonValue[key]
+        return value
+    except (KeyError, TypeError):
+        return " "
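+# Example (assumed data): getJsonValue({"title": ["Deep", "Learning"]}, "title")
+# returns "Deep Learning", while a missing key yields the " " placeholder, so
+# .split() downstream produces an empty token set.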
+
+docs = open("../data/Academic_data/docs.json", encoding="utf-8")
+# docReferences[docno] = [numKeyReferences, numCitedBy, numKeyCitations,
+#                         keyPhrases, paperAbstract, title]
+docReferences = {}
+for line in docs:
+    document = json.loads(line)
+    kp = getJsonValue(document, "keyPhrases")
+    pa = getJsonValue(document, "paperAbstract")
+    ti = getJsonValue(document, "title")
+    docNo = document["docno"]
+    refs = document["numKeyReferences"][0]
+    citedBy = document["numCitedBy"][0]
+    citations = document["numKeyCitations"][0]
+    docReferences[docNo] = [refs, citedBy, citations, kp, pa, ti]
+
+# write one comma-separated training row per (query, document) pair
+for key, value in fileWrite.items():
+    for k, v in value.items():
+        kp = set(docReferences[k][3].split())
+        pa = set(docReferences[k][4].split())
+        ti = set(docReferences[k][5].split())
+        q = set(queries[key].split())
+        row = [v.rel,
+               v.kpbm25, v.pabm25, v.tibm25,
+               v.kptf, v.patf, v.titf,
+               v.kppl2, v.papl2, v.tipl2,
+               v.kpdf, v.padf, v.tidf,
+               docReferences[k][0], docReferences[k][1], docReferences[k][2],
+               len(kp), len(pa), len(ti),
+               len(q), len(q & kp), len(q & pa), len(q & ti),
+               k]
+        rf.write(",".join(str(x) for x in row) + "\n")
+f.close()
+relevance.close()
+docs.close()
+rf.close()  # flush buffered feature rows
+print(count)  # number of queries processed
+
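+# A minimal sketch (not part of this script) of reading the features back for
+# model training, assuming every column except the trailing docno is numeric:
+#
+#     import csv
+#     with open("./supervisedTrain.txt") as fh:
+#         for row in csv.reader(fh):
+#             rel = int(row[0])
+#             features = [float(x) for x in row[1:-1]]
+#             docno = row[-1]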