0
我想解析文檔,並且如果有名稱與特定的 docno 相關聯,就計算這些名稱出現的總次數。在 for 循環結束之後,我要存儲 names[docno] = 出現次數。例如,如果 namedict = {'henry': '', 'joe': ''},而在 docno = 'doc 1' 中 henry 出現 4 次、joe 出現 6 次,字典就會存儲 {'doc 1': 10}。到目前爲止,我只能算出整個文本文件中的名稱總數。標題:計算特定 HTML 標記中單詞的集合詞典。
from xml.dom.minidom import *
import re
from string import punctuation
from operator import itemgetter
def parseTREC1(atext):
    """Count how often the tracked names occur in each document of a file.

    The file at ``atext`` is expected to hold a sequence of ``<DOC>``
    elements, each with a ``<DOCNO>`` child and zero or more ``<P>``
    paragraphs. For every document, the occurrences of all tracked names
    (``Matt``, ``Earl``, ``James``; matched case-sensitively as whole
    alphabetic words) are summed over its paragraphs.

    Args:
        atext: Path of the file to parse.

    Returns:
        A dict mapping each DOCNO (surrounding whitespace stripped) to the
        total number of tracked-name occurrences in that document.
        Documents without any tracked name map to 0. The dict is also
        printed, as the original script did with its counts.

    Raises:
        IOError/OSError: If the file cannot be opened.
        xml.parsers.expat.ExpatError: If the wrapped content is not
            well-formed XML.
        IndexError: If a ``<DOC>`` has no ``<DOCNO>`` element.
    """
    namelist = {'Matt': '', 'Earl': '', 'James': ''}
    w_re = re.compile('[a-z]+', re.IGNORECASE)

    # Use a context manager so the handle is closed even if read() fails.
    with open(atext, 'r') as handle:
        fc = handle.read()

    # The file is a run of sibling <DOC> elements with no single root, so
    # wrap it in one to make it parseable as an XML document.
    dom = parseString('<DOCS>\n' + fc + '\n</DOCS>')

    names = {}
    for doc_node in dom.getElementsByTagName('DOC'):
        docno = doc_node.getElementsByTagName('DOCNO')[0].firstChild.data
        docno = docno.strip()

        total = 0
        for p_node in doc_node.getElementsByTagName('P'):
            first = p_node.firstChild
            # An empty <P></P> has no child, and a <P> that starts with a
            # child element has no .data; skip both instead of crashing.
            text = getattr(first, 'data', None)
            if not text:
                continue
            total += sum(1 for aword in w_re.findall(text)
                         if aword in namelist)

        # Accumulate so a repeated DOCNO does not overwrite earlier counts.
        names[docno] = names.get(docno, 0) + total

    print(names)
    return names
# Run the sample analysis only when executed as a script, so that importing
# this module does not trigger file I/O.
if __name__ == '__main__':
    parseTREC1('LA010189.txt')
謝謝你——我已經讓它正常運作了。 – granimal 2011-05-01 04:28:33