是否有一個開放源代碼庫可以幫助我在.Net/C#中讀取/解析PDF文檔?在.Net中讀取PDF文檔
回答
iTextSharp是最好的選擇。用它爲lucene.Net製作蜘蛛,以便抓取PDF。
using System;
using System.IO;
using iTextSharp.text.pdf;
using System.Text.RegularExpressions;
namespace Spider.Utils
{
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
/// <remarks>
/// The parser scans the raw bytes of each page's content stream, tracks whether
/// it is inside a BT ... ET text object, and copies the characters found between
/// parentheses. It recognises operators by looking at the last few characters
/// seen, so it only detects operators that are surrounded by a space, CR or LF.
/// </remarks>
public class PDFParser
{
    // Content-stream operators referenced by this parser:
    //   BT    = Beginning of a text object operator
    //   ET    = End of a text object operator
    //   Td    = move to the start of next line
    //   5 Ts  = superscript
    //   -5 Ts = subscript

    #region Fields

    /// <summary>
    /// The number of characters to keep, when extracting text.
    /// </summary>
    private static readonly int _numberOfCharsToKeep = 15;

    /// <summary>
    /// Total number of '#' characters that make up a complete progress bar.
    /// </summary>
    private const int ProgressBarWidth = 68;

    // Operator groups, allocated once instead of once per input byte.
    private static readonly string[] BeginTextTokens = { "BT" };
    private static readonly string[] EndTextTokens = { "ET" };
    private static readonly string[] MoveTextTokens = { "TD", "Td" };
    private static readonly string[] NextLineTokens = { "'", "T*", "\"" };
    private static readonly string[] ShowTextTokens = { "Tj" };

    /// <summary>
    /// Escape sequences left in the extracted text (column 0) and the text that
    /// replaces them (column 1). Applied in order by <see cref="CleanupContent"/>.
    /// </summary>
    private static readonly string[,] EscapeReplacements =
    {
        { @"\(", "(" }, { @"\)", ")" },
        { @"\226", "-" }, { @"\222", "'" }, { @"\223", "\"" }, { @"\224", "\"" },
        { @"\340", "à" }, { @"\342", "â" }, { @"\344", "ä" },
        { @"\300", "À" }, { @"\302", "Â" }, { @"\304", "Ä" },
        { @"\351", "é" }, { @"\350", "è" }, { @"\352", "ê" }, { @"\353", "ë" },
        { @"\311", "É" }, { @"\310", "È" }, { @"\312", "Ê" }, { @"\313", "Ë" },
        { @"\362", "ò" }, { @"\364", "ô" }, { @"\366", "ö" },
        { @"\322", "Ò" }, { @"\324", "Ô" }, { @"\326", "Ö" },
        { @"\354", "ì" }, { @"\356", "î" }, { @"\357", "ï" },
        { @"\314", "Ì" }, { @"\316", "Î" }, { @"\317", "Ï" },
        { @"\347", "ç" }, { @"\307", "Ç" },
        { @"\371", "ù" }, { @"\373", "û" }, { @"\374", "ü" },
        { @"\331", "Ù" }, { @"\333", "Û" }, { @"\334", "Ü" },
        { @"\256", "®" }, { @"\231", "™" }, { @"\253", "«" }, { @"\273", "»" },
        { @"\251", "©" }, { @"\221", "'" }
    };

    #endregion

    #region ExtractText

    /// <summary>
    /// Extracts the text of every page of a PDF file and writes it, UTF-8
    /// encoded, to an output file. A progress bar is written to the console.
    /// </summary>
    /// <param name="inFileName">the full path to the pdf file.</param>
    /// <param name="outFileName">the output file name.</param>
    /// <returns>
    /// <c>true</c> when every page was processed; <c>false</c> when reading the
    /// PDF or writing the output failed.
    /// </returns>
    public bool ExtractText(string inFileName, string outFileName)
    {
        PdfReader reader = null;
        StreamWriter outFile = null;
        try
        {
            // Create a reader for the given PDF file
            reader = new PdfReader(inFileName);
            outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);

            Console.Write("Processing: ");
            int pageCount = reader.NumberOfPages;
            int written = 0;
            for (int page = 1; page <= pageCount; page++)
            {
                outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                // Advance the bar to this page's share of the total width.
                int target = (int)((long)page * ProgressBarWidth / pageCount);
                written = WriteProgress(written, target);
            }

            // Completes the bar when the document has no pages at all.
            WriteProgress(written, ProgressBarWidth);
            return true;
        }
        catch (Exception)
        {
            // Failure is reported through the return value; callers rely on
            // this method not throwing for unreadable or unwritable files.
            return false;
        }
        finally
        {
            // Release both resources even if closing the writer fails.
            try
            {
                if (outFile != null) outFile.Close();
            }
            finally
            {
                if (reader != null) reader.Close();
            }
        }
    }

    /// <summary>
    /// Writes '#' characters to the console until <paramref name="written"/>
    /// reaches <paramref name="target"/>.
    /// </summary>
    /// <returns>the number of '#' characters written so far.</returns>
    private static int WriteProgress(int written, int target)
    {
        while (written < target)
        {
            Console.Write("#");
            written++;
        }
        return written;
    }

    #endregion

    #region ExtractTextFromPDFBytes

    /// <summary>
    /// This method processes an uncompressed Adobe (text) object
    /// and extracts text.
    /// </summary>
    /// <param name="input">uncompressed content-stream bytes; may be null or empty.</param>
    /// <returns>
    /// the extracted text, or an empty string when <paramref name="input"/> is
    /// null or empty or when parsing fails.
    /// </returns>
    public string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length == 0) return "";
        try
        {
            System.Text.StringBuilder result = new System.Text.StringBuilder();

            // Flag showing if we are currently inside a text object
            bool inTextObject = false;

            // Flag showing if the next character is literal
            // e.g. '\\' to get a '\' character or '\(' to get '('
            bool nextLiteral = false;

            // () Bracket nesting level. Text appears inside ()
            int bracketDepth = 0;

            // Look-behind buffer of the most recent characters, used to
            // recognise operators after they have been read.
            char[] previousCharacters = new char[_numberOfCharsToKeep];
            for (int j = 0; j < previousCharacters.Length; j++) previousCharacters[j] = ' ';

            for (int i = 0; i < input.Length; i++)
            {
                char c = (char)input[i];
                // Byte value 213 is mapped to an apostrophe.
                if (input[i] == 213) c = '\'';

                if (inTextObject)
                {
                    // Position the text
                    if (bracketDepth == 0)
                    {
                        if (CheckToken(MoveTextTokens, previousCharacters))
                        {
                            // NOTE: "\n\r" (not "\r\n") is kept as-is so the
                            // produced text stays identical to earlier output.
                            result.Append("\n\r");
                        }
                        else if (CheckToken(NextLineTokens, previousCharacters))
                        {
                            result.Append("\n");
                        }
                        else if (CheckToken(ShowTextTokens, previousCharacters))
                        {
                            result.Append(" ");
                        }
                    }

                    if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                    {
                        // End of a text object
                        inTextObject = false;
                        result.Append(" ");
                    }
                    else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                    {
                        // Start outputting text
                        bracketDepth = 1;
                    }
                    else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                    {
                        // Stop outputting text
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        if (c == '\\' && !nextLiteral)
                        {
                            // Keep the backslash; CleanupContent resolves the
                            // escape sequences it knows about afterwards.
                            result.Append(c);
                            nextLiteral = true;
                        }
                        else
                        {
                            // Just a normal text character: copy it when it is
                            // in the printable ASCII range or in 128..254.
                            if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                            {
                                result.Append(c);
                            }
                            nextLiteral = false;
                        }
                    }
                }

                // Store the recent characters for
                // when we have to go back for a checking
                Array.Copy(previousCharacters, 1, previousCharacters, 0, previousCharacters.Length - 1);
                previousCharacters[previousCharacters.Length - 1] = c;

                // Start of a text object
                if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
                {
                    inTextObject = true;
                }
            }

            return CleanupContent(result.ToString());
        }
        catch (Exception)
        {
            // Preserves the existing contract: a page that cannot be parsed
            // yields an empty string instead of an exception.
            return "";
        }
    }

    /// <summary>
    /// Replaces the escape sequences listed in <see cref="EscapeReplacements"/>
    /// with their replacement text, in table order.
    /// </summary>
    /// <param name="text">the raw extracted text.</param>
    /// <returns>the text with known escape sequences replaced.</returns>
    private string CleanupContent(string text)
    {
        // The sequences are fixed literals, so an ordinal string replace does
        // the same job as a regular expression built on every call.
        for (int i = 0; i < EscapeReplacements.GetLength(0); i++)
        {
            text = text.Replace(EscapeReplacements[i, 0], EscapeReplacements[i, 1]);
        }
        return text;
    }

    #endregion

    #region CheckToken

    /// <summary>
    /// Check if one of the given tokens just came along (e.g. BT), i.e. the
    /// buffer ends with: delimiter, token, delimiter.
    /// </summary>
    /// <param name="tokens">the searched tokens; each may be one or more characters long.</param>
    /// <param name="recent">the recent character array, oldest character first.</param>
    /// <returns><c>true</c> when any token matches; otherwise <c>false</c>.</returns>
    private bool CheckToken(string[] tokens, char[] recent)
    {
        int last = recent.Length - 1;
        if (last < 0 || !IsTokenDelimiter(recent[last])) return false;

        foreach (string token in tokens)
        {
            if (string.IsNullOrEmpty(token)) continue;

            // The token occupies the slots just before the trailing delimiter.
            // Indexing by the token's own length keeps one-character operators
            // (' and ") from reading past the end of the token string.
            int start = last - token.Length;
            if (start < 1 || !IsTokenDelimiter(recent[start - 1])) continue;

            bool matches = true;
            for (int k = 0; k < token.Length; k++)
            {
                if (recent[start + k] != token[k])
                {
                    matches = false;
                    break;
                }
            }
            if (matches) return true;
        }
        return false;
    }

    /// <summary>
    /// Tells whether a character separates tokens: space, CR or LF.
    /// </summary>
    private static bool IsTokenDelimiter(char c)
    {
        return c == ' ' || c == 0x0d || c == 0x0a;
    }

    #endregion
}
}
你可以看看這個: http://www.codeproject.com/KB/showcase/pdfrasterizer.aspx 這不是完全免費的,但它看起來非常好。
亞歷
這能幫助將PDF轉換爲純文本嗎?該工具似乎是將PDF轉換爲圖像,那樣的話我還需要一個OCR庫 :-) – JRoppert 2008-09-17 13:33:42
還有LibHaru
鏈接已失效:http://libharu.org/ – TernaryTopiary 2017-05-08 06:37:18
另外:「在這個時候,libHaru不支持閱讀和編輯現有的PDF文件,這種支持不可能出現。」 這實際上是相關的嗎? – TernaryTopiary 2017-05-08 06:38:01
iText是我所知道的最好的庫。它最初用Java編寫,也有一個.NET移植版本。
這不是官方的移植版本,而且該鏈接也已失效。iText的官方.NET移植版本iTextSharp可以在GitHub上找到:http://github.com/itext/itextsharp – 2015-12-09 15:39:55
我以前用ITextSharp來操作/分割和改造PDF文檔 - 它非常簡單,也是開源的。
Aspose.Pdf運作得很好。不過話說回來,它是需要付費的。
/// <summary>
/// Reads a PDF file and returns the text of all its pages, concatenated in
/// page order.
/// </summary>
/// <param name="Filename">the path of the PDF file; must be a <see cref="string"/>.</param>
/// <param name="ReadLibray">not used by this method; kept so existing callers keep compiling.</param>
/// <returns>the extracted text of the whole document.</returns>
public string ReadPdfFile(object Filename, DataTable ReadLibray)
{
    // One reader serves every page; it is closed even when extraction throws.
    PdfReader reader = new PdfReader((string)Filename);
    try
    {
        System.Text.StringBuilder text = new System.Text.StringBuilder();
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            // A new strategy instance per page, as before.
            // NOTE(review): presumably the strategy accumulates text across
            // calls - confirm before hoisting it out of the loop.
            ITextExtractionStrategy strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();

            // The page text is appended unchanged. Round-tripping it through
            // Encoding.Default could only replace characters that the ANSI
            // code page cannot represent, so that step has been dropped.
            text.Append(PdfTextExtractor.GetTextFromPage(reader, page, strategy));
        }
        return text.ToString();
    }
    finally
    {
        reader.Close();
    }
}
自從這個問題在2008年最後一次被回答以來,iTextSharp已大幅改進了他們的API。如果您從http://sourceforge.net/projects/itextsharp/下載最新版本的api,則可以使用以下代碼片段將PDF中的所有文本提取爲字符串。
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
namespace PdfParser
{
/// <summary>
/// Helper that extracts the text of a PDF file with iTextSharp.
/// </summary>
public static class PdfTextExtractor
{
    /// <summary>
    /// Returns the text of every page of the PDF at <paramref name="path"/>,
    /// concatenated in page order.
    /// </summary>
    /// <param name="path">the path of the PDF file.</param>
    /// <returns>the extracted text of the whole document.</returns>
    public static string pdfText(string path)
    {
        PdfReader reader = new PdfReader(path);
        try
        {
            System.Text.StringBuilder text = new System.Text.StringBuilder();
            for (int page = 1; page <= reader.NumberOfPages; page++)
            {
                // Fully qualified on purpose: this class shares its name with
                // iTextSharp's extractor, so the short name would bind to this
                // class, which has no GetTextFromPage member.
                text.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page));
            }
            return text.ToString();
        }
        finally
        {
            // Close the reader even when a page fails to parse.
            reader.Close();
        }
    }
}
}
看看Docotic.Pdf library。它不要求你開放應用程序的源代碼(不像iTextSharp那樣採用具有傳染性的AGPL 3許可證)。
Docotic.Pdf可用於閱讀PDF文件並提取帶或不帶格式的文本。請查看顯示how to extract text from PDFs的示例。
聲明:我爲該庫的供應商Bit Miracle工作。
- 1. 在.NET中讀取文檔
- 2. 在.NET中從PDF讀取文本
- 3. 在.net中讀取大型XML文檔
- 4. 在.NET中創建PDF/DOCX/HTML文檔
- 5. org.apache.pdfbox.pdmodel.PDDocument不加載/讀取PDF文檔
- 6. 在Android中閱讀PDF文檔
- 7. 如何從iPhone中的文檔目錄中讀取pdf文件?
- 8. 有沒有可能在iText庫中讀取pdf文檔android
- 9. 在PDF/A文檔中讀取和寫入xml元數據
- 10. 從iOS的pdf文檔中讀取文本和圖像
- 11. 在.Net中原生讀取Adobe Framemaker文檔?
- 12. 如何在.NET中讀取Microsoft Word文檔?
- 13. 在Asp.net中從PDF中讀取文本
- 14. 從PDF文檔中提取文本 - C#
- 15. 使用.NET在PDF文檔中提取標記爲新版本的文本
- 16. 從文檔中讀取plist
- 17. 在C#中編程讀取PDF文件#
- 18. 如何在android中讀取pdf文件?
- 19. 在節點js中讀取PDF文件
- 20. 獲取PDF文檔大綱
- 21. 讀取文件在c#.net
- 22. 如何在.NET中的pdf文檔中獲取頁面的方向?
- 23. 讀取XML文檔
- 24. 在PDF中添加pdf表格文檔
- 25. Asp .NET從tar.gz存檔中讀取文件
- 26. 在ASP.NET中讀取XML文檔
- 27. 如何在C#中讀取XML文檔
- 28. 在php中讀取word文檔
- 29. 在xslt中讀取xml文檔
- 30. 如何在asp.net中讀取word文檔
由Brock Nusser提供的答案看起來是最新、最完善的解決方案,應該被視爲這個問題的正確答案 – ceetheman 2018-01-11 15:38:47