最近鄰的文本分類

我有兩個文本文件：（1）壞詞的樣本和（2）好詞的樣本。現在我要執行最近鄰分類，把新出現的單詞分類爲好或壞。我希望瞭解如何在我現有代碼的基礎上處理這個問題。謝謝。

import re

try:  # Python 2
    import cPickle as pickle
except ImportError:  # Python 3
    import pickle


def levenshtein_distance(first, second):
    """Return the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``first`` into ``second``.

    >>> levenshtein_distance('kitten', 'sitting')
    3
    """
    # Keep ``first`` as the shorter string so the rolling rows stay small.
    if len(first) > len(second):
        first, second = second, first
    if not first:
        # Turning an empty string into ``second`` takes len(second) inserts.
        return len(second)

    # previous[i] is the distance between first[:i] and the prefix of
    # ``second`` processed so far; only two rows are alive at any time.
    previous = list(range(len(first) + 1))
    for j, second_char in enumerate(second, 1):
        current = [j]
        for i, first_char in enumerate(first, 1):
            insertion = current[i - 1] + 1
            deletion = previous[i] + 1
            substitution = previous[i - 1] + (first_char != second_char)
            current.append(min(insertion, deletion, substitution))
        previous = current
    return previous[-1]


class Words_Works(object):
    """Word-count based text classifier using nearest edit distance.

    Attributes:
        all_texts: maps a text file path to its ``{word: count}`` dict.
        categories: maps a category name to its ``{word: count}`` dict.
        knn_results: maps text -> category -> ``{'Knn Distance': int,
            'Knn Iterations': int}``; filled in by :meth:`knn_calc`.
        stop_words: words dropped by :meth:`unstopify`.
        leaf_words: suffixes stripped by :meth:`unleafify`.
    """

    def __init__(self):
        self.all_texts = {}
        self.categories = {}
        self.knn_results = {}
        self.stop_words = [
            'and', 'the', 'i', 'am', 'he', 'she', 'his',
            'me', 'my', 'a', 'at', 'in', 'you', 'your',
            'of', 'to', 'this', 'that', 'him', 'her',
            'they', 'is', 'it', 'can', 'for', 'into',
            'as', 'with', 'we', 'us', 'them',
            'on', 'so', 'too', 'k',
            'but', 'are', 'though',
            'very', 'here', 'even', 'from',
            'then', 'than',
        ]
        self.leaf_words = ['s', 'es', 'ed', 'er', 'ly', 'ing']

    def _count_words(self, f):
        """Read file ``f`` and return a ``{word: count}`` dict.

        The text is lower-cased, tokenised, stripped of stop words and
        suffixes. As a side effect ``self.text``, ``self.words``,
        ``self.unstop`` and ``self.unleaf`` hold the intermediate results
        for this file.
        """
        with open(f) as f_in:
            self.text = f_in.read().lower()
        self.wordify()
        self.unstopify()
        self.unleafify()
        counts = {}
        for item in self.unleaf:
            counts[item] = counts.get(item, 0) + 1
        return counts

    def add_category(self, f, cat_name):
        """Register category ``cat_name`` using the words found in file ``f``."""
        self.categories[cat_name] = self._count_words(f)

    def add_text(self, f):
        """Register file ``f`` as a text to classify, keyed by its path."""
        self.all_texts[f] = self._count_words(f)

    def load_categories(self, path='tweetCategory.txt'):
        """Load ``self.categories`` from the pickle file at ``path``.

        Best effort: a failure is reported on stdout rather than raised.
        Returns ``True`` when the categories were loaded, ``False`` otherwise.
        """
        try:
            with open(path, 'rb') as cat_db:
                # NOTE: unpickling can execute arbitrary code; only load
                # category files that this program wrote itself.
                self.categories = pickle.load(cat_db)
        except Exception:
            # Unpickling can fail in many ways (missing file, truncated or
            # corrupt data); keep the original report-and-continue behavior.
            print('File not loaded from categories_db')
            return False
        print('File successfully loaded from categories db')
        return True

    def save_categories(self, path='tweetCategory.txt'):
        """Pickle ``self.categories`` to ``path`` with the highest protocol."""
        with open(path, 'wb') as cat_db:
            # pickle.dump takes the object first, then the file.
            pickle.dump(self.categories, cat_db, -1)

    def unstopify(self):
        """Set ``self.unstop`` to ``self.words`` minus the stop words."""
        stop = set(self.stop_words)
        self.unstop = [item for item in self.words if item not in stop]

    def unleafify(self):
        """Set ``self.unleaf`` to ``self.unstop`` with suffixes stripped.

        The suffixes in ``self.leaf_words`` are tried in order and every one
        that matches is removed, so a word may lose more than one suffix.
        A suffix is never stripped when that would leave an empty token.
        """
        stripped = []
        for word in self.unstop:
            for leaf in self.leaf_words:
                if leaf and len(word) > len(leaf) and word.endswith(leaf):
                    word = word[:-len(leaf)]
            stripped.append(word)
        self.unleaf = stripped

    def wordify(self):
        """Set ``self.words`` to the word tokens found in ``self.text``."""
        self.words = re.findall(r'\w+', self.text)

    def knn_calc(self):
        """Score every registered text against every category.

        For each word of a text, the distance to a category is the smallest
        Levenshtein distance to any word of that category (0 on an exact
        match, the word's own length if the category has no words). The
        per-word distances are summed into ``'Knn Distance'``.
        ``'Knn Iterations'`` counts the edit-distance computations performed.
        Results are stored in ``self.knn_results[text][category]``.
        """
        for text, text_words in self.all_texts.items():
            self.knn_results[text] = {}
            for category, category_words in self.categories.items():
                distance = 0
                iterations = 0
                for word in text_words:
                    if word in category_words:
                        continue  # exact match contributes distance 0
                    candidates = [levenshtein_distance(word, other)
                                  for other in category_words]
                    iterations += len(candidates)
                    distance += min(candidates) if candidates else len(word)
                self.knn_results[text][category] = {
                    'Knn Distance': distance,
                    'Knn Iterations': iterations,
                }

    def knn(self):
        """Print and return the nearest category for every registered text.

        Uses the scores computed by :meth:`knn_calc`. Ties are resolved in
        favour of the category whose name sorts first. Returns a dict
        mapping each text to its nearest category name, or ``None`` when no
        score is available for that text.
        """
        nearest = {}
        for text in self.all_texts:
            scores = self.knn_results.get(text, {})
            best = None
            best_distance = None
            for category in sorted(scores):
                candidate = scores[category]['Knn Distance']
                # Compare against None explicitly: a distance of 0 is valid.
                if best_distance is None or candidate < best_distance:
                    best = category
                    best_distance = candidate
            iterations = scores[best]['Knn Iterations'] if best is not None else 0

            print('File: %s' % text)
            print('Knn: %s' % best)
            print('Distance : %s' % best_distance)
            print('Iterations : %s' % iterations)
            nearest[text] = best
        print('End of nearest neighbour search')
        return nearest

並用下面的測試用例試了一下：

# Driver script: build the categories from labelled files, register the
# texts to classify, then print the intermediate data and the final result.
# print() is called with a single argument throughout so the output is the
# same on Python 2 and Python 3.
mywork = Words_Works()

# Register one category per labelled training file.
positive = 'positive.txt'
mywork.add_category(positive, 'Positive Tweets')
negative = 'negative.txt'
mywork.add_category(negative, 'Negative Tweets')
neutral = 'neutral.txt'
mywork.add_category(neutral, 'Neutral Tweets')

# Print each category name followed by its word counts.
for category in mywork.categories:
    print(category)
    print(mywork.categories[category])
    print('')
print('')

# Files to classify.
txts = ('samplegood.txt', 'samplebad.txt')

for text in txts:
    mywork.add_text(text)

# Print each text followed by its word counts.
for text in mywork.all_texts:
    print(text)
    print(mywork.all_texts[text])
    print('')
print('')

# Compute the per-category scores for every text.
mywork.knn_calc()

# Print the detailed scores: one entry per text, then per category.
for text in mywork.knn_results:
    print(text)
    for category in mywork.knn_results[text]:
        print(category)
        print(mywork.knn_results[text][category])
    print('')
print('')

# Display the final classification.
mywork.knn()

來源

2014-02-14 saopayne

兩條建議：首先，如 @YvesDaoust 所述，您應該使用編輯距離，也稱爲 Levenshtein 距離。您可以在 python-Levenshtein 套件中找到它。

其次，使用標準庫中的 unittest 或 doctest 來測試您的代碼。用保存在外部文件中的示例來測試代碼不是好主意，因爲無法訪問這些文件的第三方（例如我們）無從得知輸入是什麼；也應避免打印輸出後再手動檢查，因爲這樣做很慢、容易出錯，而且其他人無法覆核。

來源

2014-02-14 14:22:16 logc

我已經實現了 edit_distance，希望能得到任何有助於完成我的解決方案的幫助。 – saopayne

請使用編輯距離，因爲你並不是在歐氏空間中。 http://en.wikipedia.org/wiki/Edit_distance

來源

2014-02-14 14:07:48

我已經實現了 edit_distance，希望能得到任何有助於完成我的解決方案的幫助。 – saopayne

最近鄰的文本分類

回答

相關問題