我有兩個文本文件(1)壞詞的樣本和(2)樣本的好詞。現在我執行最近鄰居分類,其中找到的新單詞將被分類爲好或差。我希望瞭解如何使用我現有的代碼來處理此問題。由於最近鄰的文本分類
class Words_Works():
def __init__(self):
self.all_texts = {}
self.categories = {}
self.knn_results = {}
self.stop_words = ['and','the','i','am','he','she','his',
'me','my','a','at','in','you','your',
'of','to','this','that','him','her',
'they','is','it','can','for','into',
'as','with','we','us','them','a',
'it', 'on', 'so', 'too','k','the',
'but', 'are','though'
'very', 'here', 'even', 'from',
'then', 'than']
self.leaf_words = ['s', 'es', 'ed', 'er', 'ly', 'ing']
def add_category(self,f,cat_name):
f_in = open(f)
self.text = f_in.read().lower()
f_in.close()
self.wordify()
self.unstopify()
self.unleafify()
self.categories[cat_name] = {}
for item in self.unleaf:
if self.categories[cat_name].has_key(item):
self.categories[cat_name][item] += 1
else:
self.categories[cat_name][item] = 1
def load_categories(self):
try:
cat_db = open('tweetCategory.txt','rb')
self.categories = cPickle.load(cat_db)
cat_db.close()
print 'File successfully loaded from categories db'
except:
print 'File not loaded from categories_db'
# Finds the levenshtein's distance
def levenshtein_distance(first, second):
"""Find the Levenshtein distance between two strings."""
if len(first) > len(second):
first, second = second, first
if len(second) == 0:
return len(first)
first_length = len(first) + 1
second_length = len(second) + 1
distance_matrix = [[0] * second_length for x in range(first_length)]
for i in range(first_length):
distance_matrix[i][0] = i
for j in range(second_length):
distance_matrix[0][j]=j
for i in xrange(1, first_length):
for j in range(1, second_length):
deletion = distance_matrix[i-1][j] + 1
insertion = distance_matrix[i][j-1] + 1
substitution = distance_matrix[i-1][j-1]
if first[i-1] != second[j-1]:
substitution += 1
distance_matrix[i][j] = min(insertion, deletion, substitution)
return distance_matrix[first_length-1][second_length-1]
def add_text(self,f):
f_in = open(f)
self.text = f_in.read().lower()
f_in.close()
self.wordify()
self.unstopify()
self.unleafify()
self.all_texts[f] = {}
for item in self.unleaf:
if self.all_texts[f].has_key(item):
self.all_texts[f][item] += 1
else:
self.all_texts[f][item] = 1
def save_categories(self):
cat_db = open('tweetCategory.txt','wb')
cPickle.dump(cat_db,self.categories,-1)
cat_db.close()
def unstopify(self):
self.unstop = [item for item in self.words if item not in self.stop_words]
def unleafify(self):
self.unleaf = self.unstop[:]
for leaf in self.leaf_words:
leaf_len = len(leaf)
leaf_pattern = re.compile('%s$' % leaf)
for i in range(len(self.unleaf)):
if leaf_pattern.findall(self.unleaf[i]):
self.unleaf[i] = self.unleaf[i][:-leaf_len]
def wordify(self):
words_pattern = re.compile('//w+')
self.words = words_pattern.findall(self.text)
def knn_calc(self):
for text in self.all_texts.keys():
self.knn_results[text] = {}
for category in self.categories.keys():
self.knn_results[text][category] = {}
iterations = 0
distance = 0
for word in self.all_texts[text].keys():
if word in self.categories[text].keys():
distance = levenshtein_distance(text,category)
self.knn_results[text][category]['Knn Distance'] = distance
self.knn_results [text][category]['Knn Iterations'] = iterations
def knn(self):
for text in self.all_texts.keys():
Result = None
for category in self.categories.keys():
if not result or self.knn_results[text][category]['Knn Distance'] < result:
knn = category
distance = self.knn_results[text][category]['Knn Distance']
iterations = self.knn_results[text][category]['Knn Iterations']
print 'File:',text
print 'Knn:',category
print 'Distance :', distance
print 'Iterations :', iterations
print 'End of nearest neighbour search'
和測試用例嘗試一下:
mywork = Words_Works()
positive = 'positive.txt'
mywork.add_category(positive, 'Positive Tweets') # Adding as category
negative = 'negative.txt'
mywork.add_category(negative, 'Negative Tweets')
neutral = 'neutral.txt'
mywork.add_category(neutral, 'Neutral Tweets')
for category in mywork.categories.keys(): # Print categories
print category
print mywork.categories[category]
print
print
txts = ('samplegood.txt', 'samplebad.txt') # Creating list of files to
for text in txts: # Adding them
mywork.add_text(text)
for text in mywork.all_texts.keys(): # Print texts
print text
print mywork.all_texts[text]
print
print
mywork.knn_calc() # calculate knn
for files in mywork.knn_results.keys(): # print detailed results
print files
for category in mywork.knn_results[files].keys():
print category
print mywork.knn_results[files][category]
print
print
mywork.knn() # Display results
我已經實現了edit_distance,任何可以呈現的幫助來實現我的解決方案 – saopayne