好吧,我現在找到解決辦法了。基本上,64 位的 IFilter 工作不正常:它會把由換行符分隔的單詞合併在一起,卻沒有保留換行符。我改用 Ionic.Zip 讀取 docx 的 zip 存檔,並用稍作修改的 DocxToText 解析其中重要的 xml 文件。現在運作完美。
以下代碼最初由 Jevgenij Pankov 創建:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Ionic.Zip;
using System.IO;
using System.Xml;
/// <summary>
/// Extracts the plain text of a .docx file by reading its zip archive with
/// Ionic.Zip and walking the main document XML part.
/// </summary>
public class DocxToText
{
    private const string ContentTypeNamespace =
        @"http://schemas.openxmlformats.org/package/2006/content-types";

    private const string WordprocessingMlNamespace =
        @"http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    // Selects the <Override> element of "[Content_Types].xml" that declares
    // the main document part; its PartName attribute is the part's location.
    private const string DocumentXmlXPath =
        "/t:Types/t:Override[@ContentType=\"" +
        "application/vnd.openxmlformats-officedocument." +
        "wordprocessingml.document.main+xml\"]";

    private const string BodyXPath = "/w:document/w:body";

    private const string ContentTypesEntryName = "[Content_Types].xml";

    private readonly string docxFile;
    private string docxFileLocation = "";

    /// <summary>
    /// Creates an extractor for the given file. The file is not opened
    /// until <see cref="ExtractText"/> is called.
    /// </summary>
    /// <param name="fileName">Path of the .docx file to read.</param>
    public DocxToText(string fileName)
    {
        docxFile = fileName;
    }

    /// <summary>
    /// Extracts text from the Docx file.
    /// </summary>
    /// <returns>Extracted text.</returns>
    /// <exception cref="InvalidOperationException">
    /// No file name was supplied to the constructor.
    /// </exception>
    /// <exception cref="InvalidDataException">
    /// The archive does not declare a main document part.
    /// </exception>
    public string ExtractText()
    {
        if (string.IsNullOrEmpty(docxFile))
        {
            throw new InvalidOperationException("Input file not specified.");
        }

        // Usually it is "/word/document.xml"
        docxFileLocation = FindDocumentXmlLocation();

        if (string.IsNullOrEmpty(docxFileLocation))
        {
            throw new InvalidDataException("It is not a valid Docx file.");
        }

        return ReadDocumentXml();
    }

    /// <summary>
    /// Gets location of the "document.xml" zip entry.
    /// </summary>
    /// <returns>
    /// Location of the "document.xml" without its leading '/', or
    /// <c>null</c> when "[Content_Types].xml" is missing or does not
    /// declare a main document part.
    /// </returns>
    private string FindDocumentXmlLocation()
    {
        XmlDocument xmlDoc = LoadXmlEntry(ContentTypesEntryName);
        if (xmlDoc == null)
        {
            return null;
        }

        // Create an XmlNamespaceManager for resolving namespaces
        XmlNamespaceManager nsmgr = new XmlNamespaceManager(xmlDoc.NameTable);
        nsmgr.AddNamespace("t", ContentTypeNamespace);

        // Find location of "document.xml"
        XmlNode node = xmlDoc.DocumentElement.SelectSingleNode(DocumentXmlXPath, nsmgr);
        if (node == null)
        {
            return null;
        }

        // PartName is written with a leading '/', zip entry names are not.
        string location = ((XmlElement)node).GetAttribute("PartName");
        return location.TrimStart(new char[] { '/' });
    }

    /// <summary>
    /// Reads "document.xml" zip entry.
    /// </summary>
    /// <returns>
    /// Text contained in the document body, or an empty string when the
    /// entry or its body element cannot be found.
    /// </returns>
    private string ReadDocumentXml()
    {
        XmlDocument xmlDoc = LoadXmlEntry(docxFileLocation);
        if (xmlDoc == null)
        {
            return string.Empty;
        }

        XmlNamespaceManager nsmgr = new XmlNamespaceManager(xmlDoc.NameTable);
        nsmgr.AddNamespace("w", WordprocessingMlNamespace);

        XmlNode body = xmlDoc.DocumentElement.SelectSingleNode(BodyXPath, nsmgr);
        if (body == null)
        {
            return string.Empty;
        }

        return ReadNode(body);
    }

    /// <summary>
    /// Opens the archive and loads the first entry whose name matches
    /// <paramref name="entryName"/> (ignoring case) as an XML document.
    /// </summary>
    /// <param name="entryName">Zip entry name to look for.</param>
    /// <returns>The parsed document, or <c>null</c> if no entry matches.</returns>
    private XmlDocument LoadXmlEntry(string entryName)
    {
        using (ZipFile zip = new ZipFile(docxFile))
        {
            foreach (ZipEntry entry in zip)
            {
                // Entry names are identifiers, so compare ordinally rather
                // than with the current culture's casing rules.
                if (!string.Equals(entry.FileName, entryName, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                XmlDocument xmlDoc = new XmlDocument();
                xmlDoc.PreserveWhitespace = true;
                // The file may come from anywhere: never resolve external
                // DTDs or entities while parsing it.
                xmlDoc.XmlResolver = null;

                using (MemoryStream stream = new MemoryStream())
                {
                    entry.Extract(stream);
                    stream.Position = 0;
                    xmlDoc.Load(stream);
                }

                return xmlDoc;
            }
        }

        return null;
    }

    /// <summary>
    /// Reads content of the node and its nested children.
    /// </summary>
    /// <param name="node">XmlNode.</param>
    /// <returns>
    /// Text contained in the node; empty when the node is <c>null</c> or
    /// is not an element.
    /// </returns>
    private string ReadNode(XmlNode node)
    {
        StringBuilder sb = new StringBuilder();
        AppendNodeText(node, sb);
        return sb.ToString();
    }

    /// <summary>
    /// Appends the text of <paramref name="node"/>'s element children to
    /// <paramref name="sb"/>, recursing into nested elements. A single
    /// builder is shared by the whole walk so text is not re-copied at
    /// every nesting level.
    /// </summary>
    private static void AppendNodeText(XmlNode node, StringBuilder sb)
    {
        if (node == null || node.NodeType != XmlNodeType.Element)
        {
            return;
        }

        foreach (XmlNode child in node.ChildNodes)
        {
            if (child.NodeType != XmlNodeType.Element)
            {
                continue;
            }

            // Elements are matched by local name only, whatever their namespace.
            switch (child.LocalName)
            {
                case "t": // Text
                    string space = ((XmlElement)child).GetAttribute("xml:space");
                    if (space == "preserve")
                    {
                        // Whitespace is significant here: keep the run
                        // exactly as written instead of trimming it and
                        // tacking on one space, which added a spurious
                        // trailing space to runs like " world".
                        sb.Append(child.InnerText);
                    }
                    else
                    {
                        sb.Append(child.InnerText.TrimEnd());
                    }
                    break;

                case "cr": // Carriage return
                case "br": // Break
                    sb.Append(Environment.NewLine);
                    break;

                case "tab": // Tab
                    sb.Append("\t");
                    break;

                case "p": // Paragraph: its text followed by a blank line
                    AppendNodeText(child, sb);
                    sb.Append(Environment.NewLine);
                    sb.Append(Environment.NewLine);
                    break;

                default:
                    AppendNodeText(child, sb);
                    break;
            }
        }
    }
}
下面是使用修改後代碼的示例:
// Build an extractor for the .docx file at `filepath`, then pull out its text.
var extractor = new DocxToText(filepath);
string docxText = extractor.ExtractText();
這個問題應該屬於 Super User 網站 – Oded
看過ifilter後......似乎現在的實現有問題......我自己解析docx xml以避免出現這樣的問題 – bastianneu
謝謝 Bastianneu。是否可以擴展 IFilter 來糾正這個問題?由於 IFilter 對 PDF 和舊版本的 doc 都能正常工作,只爲這種新類型另外使用一套不同的方案並不是一個好策略。 – Txugo