首先,下面的代碼可以照原樣運行。我主要是Ruby程序員,對Python還在摸索階段,所以我相信一定有更符合DRY原則的方法來完成下面所做的事情。(標題:Pythonic:收集任意字符串 - 索引器)


import sys 
# Phrase -> number of occurrences, for every phrase of one to four words.
wordcount = {}
# The three most recently seen words; "" means "not seen yet".
last_word = ""
last_last_word = ""
last_last_last_word = ""

# NOTE(review): the snippet reads from a name ``file`` that is never defined.
# This assumes the input path is the first command-line argument -- confirm
# against the original script.
with open(sys.argv[1]) as source:
    words = source.read().split()

for word in words:
    # Count the word on its own, then grow the phrase backwards one
    # remembered word at a time (2-, 3- and 4-word phrases ending here).
    phrase = word
    wordcount[phrase] = wordcount.get(phrase, 0) + 1
    for previous in (last_word, last_last_word, last_last_last_word):
        if previous == "":
            # The history fills newest-first, so nothing older is set either.
            break
        phrase = previous + " " + phrase
        wordcount[phrase] = wordcount.get(phrase, 0) + 1

    # Slide the three-word history forward by one.
    last_last_last_word, last_last_word, last_word = (
        last_last_word, last_word, word)

# Most frequent phrases first.
for k, v in sorted(wordcount.items(), key=lambda x: x[1], reverse=True):
    print("{} {}".format(k, v))



this is a sample input file an input file will always be all lower case with no punctuation 


file 2 
input 2 
input file 2 
an input file 1 
all 1 
lower case 1 
be 1 
is 1 
file will always 1 
an 1 
sample 1 
case 1 
always be all lower 1 
this is a 1 
will always be 1 
sample input file 1 
will always 1 
is a sample 1 
all lower 1 
lower case with no 1 
no 1 
with 1 
with no 1 
file will always be 1 
with no punctuation 1 
lower 1 
be all lower case 1 
no punctuation 1 
an input file will 1 
input file an 1 
file an 1 
input file an input 1 
always be 1 
file an input file 1 
be all 1 
is a 1 
input file will 1 
file will 1 
an input 1 
input file will always 1 
will always be all 1 
always be all 1 
lower case with 1 
a sample 1 
a sample input file 1 
a sample input 1 
is a sample input 1 
be all lower 1 
a 1 
sample input file an 1 
sample input 1 
case with no punctuation 1 
all lower case with 1 
this 1 
always 1 
file an input 1 
case with 1 
case with no 1 
will 1 
all lower case 1 
punctuation 1 
this is 1 
this is a sample 1 



那麼你是指「四個單詞短語」呢?你能給我們一個輸入和期望輸出的例子嗎? –


我認爲他的意思是四個字的短語。 – Pablo


@Pablo:那麼要如何取得四個單詞的短語呢? - 致OP:你的意思只是把 `file.read().split()` 的結果分成若干塊嗎? –




# Sample input text; read as a module-level global by the snippets below.
string="this is a sample input file an input file will always be all lower case with no punctuation" 

def words(count, text=None):
    """Return every phrase of exactly ``count`` consecutive words, in order.

    Args:
        count: number of words per phrase.
        text: string to split on whitespace. When omitted, the module-level
            ``string`` is used, which keeps existing ``words(n)`` calls working.

    Returns:
        A list of space-joined phrases, one per starting position (repeats
        are kept). Empty when ``count`` is not positive or exceeds the
        number of words.
    """
    if text is None:
        text = string
    if count <= 0:
        return []
    # Split once instead of once per candidate slice.
    tokens = text.split()
    return [" ".join(tokens[start:start + count])
            for start in range(len(tokens) - count + 1)]



from collections import Counter

lst = words(3)

# Tally in a single pass instead of rescanning the list with lst.count()
# once per distinct phrase.
for word, occurrences in Counter(lst).items():
    print("{} {}".format(word, occurrences))

an input file 1 
file will always 1 
is a sample 1 
be all lower 1 
file an input 1 
with no punctuation 1 
input file will 1 
lower case with 1 
this is a 1 
always be all 1 
will always be 1 
sample input file 1 
a sample input 1 
all lower case 1 
case with no 1 
input file an 1 




words_list = string.split()
words_dict = {}

# Count every contiguous run words_list[start:end]. ``end`` goes up to
# len(words_list) inclusive so phrases ending on the final word are counted.
# NOTE(review): phrase length is unbounded here, unlike the one-to-four-word
# phrases in the sample output above -- confirm whether a cap is wanted.
for end in range(1, len(words_list) + 1):
    for start in range(end):
        phrase = " ".join(words_list[start:end])
        # get() covers both the first sighting and later repeats.
        words_dict[phrase] = words_dict.get(phrase, 0) + 1

for i in words_dict:
    print("{} {}".format(i, words_dict[i]))



這與所提供的不匹配輸出。 – UtsavShah


這也是非常低效的,調用list.count是一個非常糟糕的方法來獲得計數。 –


讓我試試別的,只是一會兒。 – Rockybilly




import sys 
from collections import defaultdict 


# Phrase -> occurrence count; unseen phrases start at zero.
wordcount = defaultdict(int)
# One empty-string placeholder per slot; the slot count is the second
# command-line argument.
wordlist = [""] * int(sys.argv[2])

def check(wordcount, wordlist, word):
    """Slide the window forward by ``word`` and count each phrase ending at it.

    Args:
        wordcount: mapping of phrase -> count, updated in place. Missing keys
            must default to 0 (the script passes a ``defaultdict(int)``).
        wordlist: the current window of recent words; ``""`` entries are
            slots that have not been filled yet.
        word: the word just read.

    Returns:
        The new window: same length as ``wordlist``, oldest entry dropped,
        ``word`` appended.
    """
    # Append before trimming so an empty window stays empty.
    window = (wordlist + [word])[1:]
    for start, first in enumerate(window):
        # Skip leading placeholders: a phrase must start on a real word.
        if first != "":
            wordcount[" ".join(window[start:])] += 1
    return window

# NOTE(review): the snippet reads from a name ``file`` that is never defined.
# This assumes the input path is the first command-line argument -- confirm.
with open(sys.argv[1]) as source:
    # Read line by line so the whole file is never held in memory; the window
    # carries over between lines, so phrases still span line breaks.
    for line in source:
        for word in line.split():
            wordlist = check(wordcount, wordlist, word)

# Most frequent phrases first.
for k, v in sorted(wordcount.items(), key=lambda x: x[1], reverse=True):
    print("{} {}".format(k, v))

這仍然不夠DRY(仍有重複的代碼)。 –


另外,我想你已經打破了四個單詞短語的邏輯。 –


@DavidHoelzer現在看看? – UtsavShah



from collections import Counter 
import itertools 
import operator as op 

def count_phrases(words, phrase_len):
    """Count every run of consecutive words for each requested length.

    Args:
        words: sequence of words (must support ``len`` and slicing).
        phrase_len: iterable of phrase lengths to count.

    Returns:
        A ``Counter`` keyed by tuples of words. Empty when ``phrase_len`` is
        empty or no run of a requested length fits in ``words``.
    """
    # Accumulate into one Counter: folding with no start value would fail
    # when ``phrase_len`` is empty.
    totals = Counter()
    for length in phrase_len:
        totals.update(
            tuple(words[start:start + length])
            for start in range(len(words) - length + 1)
        )
    return totals


words = "a b c a a".split()
# Keys are tuples of words; join them back into a phrase for display.
for phrase, count in count_phrases(words, [1, 2]).items():
    print("{} {}".format(" ".join(phrase), count))


b c 1 
a 3 
c 1 
b 1 
c a 1 
a a 1 
a b 1 

爲什麼downvote? –


我沒有downvote,但邏輯有問題。單詞之間的空格是必需的,而您的解決方案沒有保留它們。 – UtsavShah


@UtsavShah 空格是怎麼回事? –



def parser(data, size):
    """Return every run of ``size`` consecutive words in ``data``.

    Args:
        data: text to split on whitespace.
        size: number of words per phrase.

    Returns:
        A list of space-joined phrases in order of appearance (repeats are
        kept). Empty when ``size`` is not positive or exceeds the word count.
    """
    if size <= 0:
        return []
    chunked = data.split()
    # The +1 keeps the window that ends on the final word.
    return [' '.join(chunked[i:size + i])
            for i in range(len(chunked) - size + 1)]

def parse_file(fname, size):
    """Count every phrase of one to ``size`` words on each line of a file.

    Args:
        fname: path of the text file to read.
        size: longest phrase length to count, inclusive.

    Returns:
        A ``Counter`` mapping each space-joined phrase to its number of
        occurrences. Lines are handled independently, so no phrase spans a
        line break.
    """
    result = Counter()
    with open(fname, 'r') as f:
        # Iterate the file object so only one line is in memory at a time.
        for data in f:
            for i in range(1, size + 1):
                result.update(parser(data, i))
    return result

# Tally the phrases in file.txt, then show them most frequent first.
result = parse_file('file.txt', 4)
print(sorted(result.items(), key=lambda pair: pair[1], reverse=True))

[('file', 2), 
('input', 2), 
('input file', 2), 
('an input file', 1), 
('all', 1), 
('always be all', 1), 
('is', 1), 
('an', 1), 
('sample', 1), 
('this is a', 1), 
('will always be', 1), 
('sample input file', 1), 
('will always', 1), 
('is a sample', 1), 
('all lower', 1), 
('no', 1), 
('with no', 1), 
('lower case', 1), 
('case', 1), 
('input file will', 1), 
('case with no', 1), 
('input file an', 1), 
('file an', 1), 
('be', 1), 
('always be', 1), 
('be all lower', 1), 
('be all', 1), 
('lower', 1), 
('is a', 1), 
('an input', 1), 
('a sample input', 1), 
('lower case with', 1), 
('a sample', 1), 
('file will', 1), 
('with', 1), 
('a', 1), 
('file will always', 1), 
('sample input', 1), 
('this', 1), 
('always', 1), 
('file an input', 1), 
('case with', 1), 
('will', 1), 
('all lower case', 1), 
('this is', 1)] 

你在沒有使用上下文管理器的情況下打開了文件,並且忘記關閉它。 –


要處理的文件會非常龐大。先把整個文件讀入內存看起來並不是最佳做法。 –


你也可以使用yield。我可以更新代碼,如果這是唯一的問題 –



import sys 
# Phrase -> number of occurrences.
wordcount = {}
# Longest phrase, in words, that gets counted.
nb_words = 4
# The most recent words, oldest first, at most nb_words of them.
last_words = []

# NOTE(review): the snippet reads from a name ``file`` that is never defined.
# This assumes the input path is the first command-line argument -- confirm.
with open(sys.argv[1]) as source:
    words = source.read().split()

for word in words:
    last_words.append(word)
    if len(last_words) > nb_words:
        # Drop the oldest word so the window never exceeds nb_words.
        last_words.pop(0)
    # Every suffix of the window is a phrase ending at the current word,
    # already in reading order.
    for i in range(len(last_words)):
        key = ' '.join(last_words[i:])
        wordcount[key] = wordcount.get(key, 0) + 1

# Most frequent phrases first.
for k, v in sorted(wordcount.items(), key=lambda x: x[1], reverse=True):
    print("{} {}".format(k, v))

我用循環取代了那些單獨的變量。所以現在短語的單詞數是一個參數,可以設置成超過4個單詞。 編輯:經過一些錯誤修正後,我現在確定它會產生相同的輸出。



import re 
import mmap 
from itertools import islice, izip, tee 
from collections import Counter 
from pprint import pprint 

def word_grouper(filename, size):
    """Count every phrase of one to ``size`` consecutive words in a file.

    A word is a maximal run of lowercase ASCII letters (``[a-z]+``). The
    file is memory-mapped and scanned lazily, so it is never read into a
    Python string as a whole.

    Args:
        filename: path of the file to scan.
        size: longest phrase length to count, inclusive.

    Returns:
        A ``Counter`` keyed by tuples of words. Empty for an empty file or a
        non-positive ``size``.
    """
    import os
    from collections import deque

    counts = Counter()
    # Holds the last ``size`` words; older ones fall off automatically.
    window = deque(maxlen=max(size, 0))
    with open(filename) as fin:
        if os.fstat(fin.fileno()).st_size == 0:
            # mmap refuses zero-length files; there is nothing to count.
            return counts
        mm = mmap.mmap(fin.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            for match in re.finditer(b'[a-z]+', mm):
                word = match.group()
                if not isinstance(word, str):
                    # Python 3 yields bytes from an mmap; keep keys as str.
                    word = word.decode('ascii')
                window.append(word)
                recent = tuple(window)
                # Each suffix is a phrase ending at the current word, so
                # phrases near the end of the file are counted as well.
                counts.update(recent[start:] for start in range(len(recent)))
        finally:
            mm.close()

    return counts

# Tally phrases of up to four words in the named file.
counts = word_grouper(filename='input filename', size=4)
# do appropriate formatting instead of just `pprint`ing


[(('file',), 2), 
(('input', 'file'), 2), 
(('input',), 2), 
(('a', 'sample', 'input'), 1), 
(('file', 'will', 'always', 'be'), 1), 
(('sample', 'input', 'file', 'an'), 1), 
(('this', 'is', 'a', 'sample'), 1), 
(('this', 'is'), 1), 
(('will',), 1), 
(('lower', 'case', 'with'), 1), 
(('an', 'input', 'file', 'will'), 1), 
(('sample', 'input'), 1), 
(('is', 'a'), 1), 
(('all', 'lower', 'case', 'with'), 1), 
(('input', 'file', 'will'), 1), 
(('an',), 1), 
(('always', 'be'), 1), 
(('lower', 'case', 'with', 'no'), 1), 
(('an', 'input'), 1), 
(('be', 'all', 'lower'), 1), 
(('this',), 1), 
(('be', 'all', 'lower', 'case'), 1), 
(('this', 'is', 'a'), 1), 
(('sample',), 1), 
(('sample', 'input', 'file'), 1), 
(('will', 'always', 'be', 'all'), 1), 
(('a',), 1), 
(('a', 'sample'), 1), 
(('is', 'a', 'sample'), 1), 
(('will', 'always'), 1), 
(('lower',), 1), 
(('lower', 'case'), 1), 
(('file', 'an'), 1), 
(('file', 'an', 'input'), 1), 
(('file', 'will'), 1), 
(('is',), 1), 
(('all', 'lower'), 1), 
(('input', 'file', 'an', 'input'), 1), 
(('always', 'be', 'all', 'lower'), 1), 
(('an', 'input', 'file'), 1), 
(('input', 'file', 'an'), 1), 
(('be', 'all'), 1), 
(('input', 'file', 'will', 'always'), 1), 
(('be',), 1), 
(('all',), 1), 
(('always', 'be', 'all'), 1), 
(('is', 'a', 'sample', 'input'), 1), 
(('always',), 1), 
(('all', 'lower', 'case'), 1), 
(('file', 'an', 'input', 'file'), 1), 
(('file', 'will', 'always'), 1), 
(('a', 'sample', 'input', 'file'), 1), 
(('will', 'always', 'be'), 1)]