import re 
import nltk 
import string 
from collections import Counter 
from nltk.corpus import stopwords 
from collections import defaultdict, Counter 
from nltk.corpus import brown 

brown = nltk.corpus.brown 
stoplist = stopwords.words('english') 

from collections import defaultdict 

def toptenwords(brown): 
    words = brown.words() 
    no_capitals = ([word.lower() for word in words]) 
    filtered = [word for word in no_capitals if word not in stoplist] 
    translate_table = dict((ord(char), None) for char in string.punctuation) 
    no_punct = [s.translate(translate_table) for s in filtered] 
    wordcounter = defaultdict(int) 
    for word in no_punct: 
     if word in wordcounter: 
      wordcounter[word] += 1 
      wordcounter[word] = 1 
    sorting = [(k, wordcounter[k])for k in sorted(wordcounter, key = wordcounter.get, reverse = True)] 
    return sorting 


words_2 = [word[0] for word in brown.tagged_words(categories="news")] 
# the most frequent words 
print Counter(words_2).most_common(10) 

words_2 = [word[1] for word in brown.tagged_words(categories="news")] 
# the most frequent word class 
print Counter(words_2).most_common(10) 

# Keeps words and pos into a dictionary 
# where the key is a word and 
# the value is a counter of POS and counts 
word_tags = defaultdict(Counter) 
for word, pos in brown.tagged_words(): 
word_tags[word][pos] +=1 

# To access the POS counter. 
print 'Red', word_tags['Red'] 
print 'Marlowe', word_tags['Marlowe'] 

# Greatest number of distinct tag. 
word_with_most_distinct_pos = sorted(word_tags, key=lambda x: len(word_tags[x]), reverse=True)[0] 

print word_with_most_distinct_pos 
print word_tags[word_with_most_distinct_pos] 
print len(word_tags[word_with_most_distinct_pos]) 

# which word has the greatest number of distinct tags 
word_tags_2 = nltk.defaultdict(lambda: set()) 
for word, token in tagged_words: 
    ambig_words = sorted([(k, len(v)) for (k, v) in word_tags.items()]), 
    key=itemgetter(1), reverse=True)[:50] 
    print [(word, numtoks, word_tags[word]) for (word, numtoks) in ambig_words] 


File "Oblig2a.py", line 64 
    key=itemgetter(1), reverse=True)[:50] 
SyntaxError: invalid syntax 


  1. 最常說的一句話
  2. 最常用單詞分類
  3. 最不頻繁單詞分類
  4. 與多個字班有多少個字
  5. 哪個詞有最標籤,有多少不同的標籤是有
  6. 我需要幫助的最後一件事是一個函數寫一個特定的詞,寫就怎麼很多時候它會出現在每個標籤中。我試圖做到這一點上面,但我不能得到它的工作...

它是數字3,4,5和6我需要幫助... 任何幫助將是非常受歡迎的。


看堆棧跟蹤。違規行顯然是'stoplist = stopwords.words(brown)'。此方法需要文件ID,但不是一系列標記的單詞(這是您分配給變量「brown」的內容)。 – lenz


我該如何改變它? –


您應該爲該功能提供該語言的名稱,例如'stoplist = stopwords.words('english')' –




  1. 錯誤什麼的解釋是告訴你 - 你應該到停止字功能提供的語言名稱:stoplist = stopwords.words('english')
  2. 使用defaultdict字典get方法正確排序字典: [(k, wordcounter[k])for k in sorted(wordcounter, key = wordcounter.get, reverse = True)]
  3. 上使用Unicode數據轉換表,見string.translate() with unicode data in python
  4. 布朗標記詞在FO元組RMAT (word, part-of-speech)


import re 
import nltk 
import string 
from collections import Counter 
from nltk.corpus import stopwords 

brown = nltk.corpus.brown 
stoplist = stopwords.words('english') 

from collections import defaultdict 

def toptenwords(brown): 
    words = brown.words() 
    no_capitals = set([word.lower() for word in words]) 
    filtered = [word for word in no_capitals if word not in stoplist] 
    translate_table = dict((ord(char), None) for char in string.punctuation) 
    no_punct = [s.translate(translate_table) for s in filtered] 
    wordcounter = defaultdict(int) 
    for word in no_punct: 
     if word in wordcounter: 
      wordcounter[word] += 1 
      wordcounter[word] = 1 
    sorting = [(k, wordcounter[k])for k in sorted(wordcounter, key = wordcounter.get, reverse = True)] 
    return sorting 


words_2 = [word[0] for word in brown.tagged_words(categories="news")] 
# the most frequent words 
print Counter(words_2).most_common(10) 

words_2 = [word[1] for word in brown.tagged_words(categories="news")] 
# the most frequent word class 
print Counter(words_2).most_common(10) 

謝謝!但是,我如何從這段代碼中得到最少使用的單詞和單詞類? –


檢查此主題http:// stackoverflow。com/questions/4743035/python-3-1-obtain-the-least-common-elements-array –


我試過了,但是我沒有得到我想要的輸出。 –
