Fixed encoding error for computer classification.

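Removed words causing encoding errors: any word on which nltk.pos_tag raises an exception is now skipped before the part-of-speech tagging step.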

Fixed encoding error for computer classification.
b71b36d2 · hoaglan2 · cbad7835 · b71b36d2 · b71b36d2 · b71b36d2
Commit b71b36d2 authored 10 years ago by hoaglan2
--- a/Scripts/6_recallClassification_Bayes_New_Data.py
+++ b/Scripts/6_recallClassification_Bayes_New_Data.py
@@ -41,8 +41,16 @@ def selectFeatures(train_set, train_text, k):
    #words = [wnl.lemmatize(t, 'v') for t in words]
    print str(len(words))+' words found.'

+    clean_words = []
+    for word in words:
+        try:
+            v = nltk.pos_tag(word)
+            clean_words.append(word)
+        except:
+            continue
+
    # Tagging of the parts of speech
-    tagged_words = nltk.pos_tag(words)
+    tagged_words = nltk.pos_tag(clean_words)
    words = [word for (word, tag) in tagged_words if tag in ['NN','JJ','NNS','RB','VB','VBD','VBG','VBN','VBP','VBZ']]
    #words = set(words).intersection(set(unigrams+ bigrams+trigrams));
    # Get the frequency of words

--- a/Scripts/6_recallClassification_Bayes_New_Data.pyc
+++ b/Scripts/6_recallClassification_Bayes_New_Data.pyc
--- a/Scripts/7_automatedSteps.py
+++ b/Scripts/7_automatedSteps.py
@@ -10,7 +10,7 @@ import os

 #markers for which sections to execute
 pieces = {"Retrieve": False, "Unique": True, "ClassifyS1": True,
-        "ClassifyS2": False, "Procodes": True}
+        "ClassifyS2": True, "Procodes": True}

 '''
    SCRIPT 1 --> Retrieve the data
@@ -23,7 +23,7 @@ if(pieces["Retrieve"]):
    basepath = './../Original_Data';
    os.chdir(basepath)

-    #get data from 2005-2012
+    #get data from 2006-2012
    for Year in range(2007, 2008):
        print 'Year '+str(Year);
        startYear = Year;

--- a/Unique_Data/best_keywords.txt
+++ b/Unique_Data/best_keywords.txt
--- a/Unique_Data/unique2007_classified.xls
+++ b/Unique_Data/unique2007_classified.xls