import nltk
import string
from nltk.corpus import stopwords
from collections import Counter
def get_tokens(path='comet_interest.xml'):
    """Read a text file and return its lower-cased, punctuation-free tokens.

    Args:
        path: File to read. Defaults to 'comet_interest.xml', the file this
            function previously always opened.

    Returns:
        The token list produced by ``nltk.word_tokenize`` on the cleaned text.
    """
    with open(path, 'r') as bookmark:
        text = bookmark.read()
    lowers = text.lower()
    # str.translate(None, chars) is only accepted by Python 2 byte strings;
    # it raises TypeError for unicode objects and for every Python 3 str.
    # Dropping the characters one by one removes the same set on both.
    punctuation = set(string.punctuation)
    no_punctuation = ''.join(ch for ch in lowers if ch not in punctuation)
    tokens = nltk.word_tokenize(no_punctuation)
    return tokens
# Remove stopwords.
tokens = get_tokens()
# Fetch the stopword list once instead of once per token, and keep it in a
# set so each membership test is a hash lookup rather than a list scan.
english_stopwords = set(stopwords.words('english'))
filtered = [w for w in tokens if w not in english_stopwords]
count = Counter(filtered)
# A parenthesised single-argument print prints the same thing as a Python 2
# print statement and is also valid as the Python 3 print function.
print(count.most_common(10))
#stemming
from nltk.stem.porter import *
def stem_tokens(tokens, stemmer):
    """Return the stem of every token, preserving order.

    Args:
        tokens: Iterable of tokens to stem.
        stemmer: Any object exposing a ``stem(item)`` method.

    Returns:
        A new list holding ``stemmer.stem(item)`` for each item in ``tokens``.
    """
    return [stemmer.stem(item) for item in tokens]
stemmer = PorterStemmer()
stemmed = stem_tokens(filtered, stemmer)
count = Counter(stemmed)
# The output reported for this script shows entries such as u'spatial': the
# stems are unicode objects on Python 2, and the u'' prefix is only part of
# how a list displays them, not part of the words themselves.
# A parenthesised single-argument print is valid on both Python 2 and 3.
print(count.most_common(10))
輸出結果如下。(問題標題:爲什麼 NLTK 中的 PorterStemmer 會把我的 "string" 轉換爲 u"string"?)
[('analysis', 13), ('spatial', 11), ...]
[(u'analysi', 13), (u'spatial', 11), (u'use', 11), (u'feb', 8), (u'cdata', 8), (u'scienc', 7), (u'descript', 7), (u'item', 6), (u'includ', 6), (u'mani', 6)]
第二個結果有什麼問題?爲什麼每個詞的前面都多了一個「u」?
...因爲它們是Unicode字符串? – kindall 2015-02-12 04:20:42
哦。但爲什麼第一個不是Unicode?以及如何從Unicode轉換爲字符串? – 2015-02-12 16:49:56