Skip to content
Snippets Groups Projects
Commit b71b36d2 authored by hoaglan2
Browse files

Fixed encoding error for computer classification.

Removed words causing encoding errors.
parent cbad7835
Branches master
No related tags found
No related merge requests found
...@@ -41,8 +41,16 @@ def selectFeatures(train_set, train_text, k): ...@@ -41,8 +41,16 @@ def selectFeatures(train_set, train_text, k):
#words = [wnl.lemmatize(t, 'v') for t in words] #words = [wnl.lemmatize(t, 'v') for t in words]
print str(len(words))+' words found.' print str(len(words))+' words found.'
clean_words = []
for word in words:
try:
v = nltk.pos_tag(word)
clean_words.append(word)
except:
continue
# Tagging of the parts of speech # Tagging of the parts of speech
tagged_words = nltk.pos_tag(words) tagged_words = nltk.pos_tag(clean_words)
words = [word for (word, tag) in tagged_words if tag in ['NN','JJ','NNS','RB','VB','VBD','VBG','VBN','VBP','VBZ']] words = [word for (word, tag) in tagged_words if tag in ['NN','JJ','NNS','RB','VB','VBD','VBG','VBN','VBP','VBZ']]
#words = set(words).intersection(set(unigrams+ bigrams+trigrams)); #words = set(words).intersection(set(unigrams+ bigrams+trigrams));
# Get the frequency of words # Get the frequency of words
......
No preview for this file type
...@@ -10,7 +10,7 @@ import os ...@@ -10,7 +10,7 @@ import os
#markers for which sections to execute #markers for which sections to execute
pieces = {"Retrieve": False, "Unique": True, "ClassifyS1": True, pieces = {"Retrieve": False, "Unique": True, "ClassifyS1": True,
"ClassifyS2": False, "Procodes": True} "ClassifyS2": True, "Procodes": True}
''' '''
SCRIPT 1 --> Retrieve the data SCRIPT 1 --> Retrieve the data
...@@ -23,7 +23,7 @@ if(pieces["Retrieve"]): ...@@ -23,7 +23,7 @@ if(pieces["Retrieve"]):
basepath = './../Original_Data'; basepath = './../Original_Data';
os.chdir(basepath) os.chdir(basepath)
#get data from 2005-2012 #get data from 2006-2012
for Year in range(2007, 2008): for Year in range(2007, 2008):
print 'Year '+str(Year); print 'Year '+str(Year);
startYear = Year; startYear = Year;
......
This diff is collapsed.
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment