一個docx文件只是一個zip文件(嘗試運行unzip就可以了!),其中包含大量定義良好的XML和附屬文件。
import zipfile
from lxml import etree
def get_word(docx_file_name):
    """Return the raw XML of the main document part of a .docx file.

    A .docx file is a zip archive; this reads its ``word/document.xml``
    member.

    Args:
        docx_file_name: Path to the .docx file.

    Returns:
        The contents of ``word/document.xml`` as a byte string.

    Raises:
        zipfile.BadZipfile: If the file is not a valid zip archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # Hand the path straight to ZipFile: it opens the file in binary mode
    # itself (a text-mode handle corrupts or fails on zip data), and the
    # context manager closes the archive even if the read raises.
    with zipfile.ZipFile(docx_file_name) as archive:
        return archive.read('word/document.xml')
#parse the string containing XML into a usable tree
def get_xml_tree(xml_string):
    """Parse a string of XML and return the root element of the parsed tree."""
    root = etree.fromstring(xml_string)
    return root
#lxml has functions for traversing the XML tree, but I used iter() instead, which
#will traverse every node given a starting node 'my_etree', and return every
#text node and its contained text
def _itertext(self, myetree):
    """Walk the XML tree and yield every WordprocessingML text node.

    Args:
        myetree: Starting node of the tree to traverse (for example the
            root returned by ``etree.fromstring``).

    Yields:
        ``(node, text)`` tuples, one per ``t`` element found, where
        ``text`` is that node's ``.text`` value.
    """
    # The body previously read an undefined name (`my_etree`); it must use
    # the actual parameter. Passing `etree.Element` as the tag filter asks
    # lxml for element nodes only.
    for node in myetree.iter(tag=etree.Element):
        if self._check_element_is(node, 't'):
            yield node, node.text
def _check_element_is(self, element, typr_char):
    """Return True if *element* is the WordprocessingML tag *typr_char*.

    Args:
        element: A node exposing a ``tag`` attribute.
        typr_char: Local tag name to test for, e.g. ``'t'``. (The
            parameter name is kept as spelled for compatibility with
            existing callers.)

    Returns:
        True when ``element.tag`` equals the namespace-qualified name
        built from the WordprocessingML namespace and ``typr_char``.
    """
    word_schema = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    # Namespace-qualified tags are spelled "{namespace-uri}local-name".
    return element.tag == '{%s}%s' % (word_schema, typr_char)
# Example usage: load the document XML, parse it, and print every text run.
# NOTE(review): this fragment uses `self`, so it presumably belongs inside a
# method of the class that holds the helpers above -- confirm.
# NOTE(review): `get_word_xml` and `wod_filename` are not defined in the
# visible code (the loader above is named `get_word`) -- verify the names.
xml_from_file = self.get_word_xml(wod_filename)
xml_tree = self.get_xml_tree(xml_from_file)
for node, txt in self._itertext(xml_tree):
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(txt)
更多內容請見此處
來源
2014-02-10 10:16:21
Olu
'.doc'可能是一個專有的微軟Word文件。你不能像純文本文件那樣閱讀它。 – 2014-02-10 09:39:56
你可以用word打開它們,並將它們保存爲.txt文件嗎? –
@tk,還沒有嘗試過那個。它安全嗎?如果用戶沒有安裝 Word 應用程序會怎麼樣? – Bazinga