Skip to content
Snippets Groups Projects
Commit b71b36d2 authored by hoaglan2
Browse files

Fixed encoding error for computer classification.

Removed words causing encoding errors.
parent cbad7835
No related branches found
No related tags found
No related merge requests found
......@@ -41,8 +41,16 @@ def selectFeatures(train_set, train_text, k):
#words = [wnl.lemmatize(t, 'v') for t in words]
print str(len(words))+' words found.'
clean_words = []
for word in words:
try:
v = nltk.pos_tag(word)
clean_words.append(word)
except:
continue
# Tagging of the parts of speech
tagged_words = nltk.pos_tag(words)
tagged_words = nltk.pos_tag(clean_words)
words = [word for (word, tag) in tagged_words if tag in ['NN','JJ','NNS','RB','VB','VBD','VBG','VBN','VBP','VBZ']]
#words = set(words).intersection(set(unigrams+ bigrams+trigrams));
# Get the frequency of words
......
No preview for this file type
......@@ -10,7 +10,7 @@ import os
#markers for which sections to execute
pieces = {"Retrieve": False, "Unique": True, "ClassifyS1": True,
"ClassifyS2": False, "Procodes": True}
"ClassifyS2": True, "Procodes": True}
'''
SCRIPT 1 --> Retrieve the data
......@@ -23,7 +23,7 @@ if(pieces["Retrieve"]):
basepath = './../Original_Data';
os.chdir(basepath)
#get data from 2005-2012
#get data from 2006-2012
for Year in range(2007, 2008):
print 'Year '+str(Year);
startYear = Year;
......
This diff is collapsed.
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment