2016-09-15 69 views
0

我想在PDF文件中搜索某個單詞並將其替換，例如搜索「錯誤」並替換爲「正確」。 我已經使用iText v5.5.9（參考 http://www.codeguru.com/columns/vb/manipulating-pdf-files-with-itextsharp-and-vb.net-2012.htm ）寫了一個測試程序，基本可以正常工作（但被替換的文本似乎仍顯示在上層）。 我想知道iText v7是否會更好/更簡單，以及是否有人已經做到了/可以提供幫助。 下面是V5測試代碼：它使用R/W密碼從數據庫讀取PDF，然後再用R/W密碼將其寫回：iText7 .NET搜索/替換

' Based on http://www.codeguru.com/columns/vb/manipulating-pdf-files-with-itextsharp-and-vb.net-2012.htm 

Imports System.IO 'Working With Files 
Imports System.Text 'Working With Text 
Imports System.Data.SqlClient 

Imports iTextSharp.text 'Core PDF Text Functionalities 
Imports iTextSharp.text.pdf 'PDF Content 
Imports iTextSharp.text.pdf.parser 'Content Parser 

Imports pdf_clr.LocTextExtraction 'Import LocationTextExtractionStrategy Capabilities 

Public Class Class1 

Public Shared Sub ReplacePDFText(ByVal strSource As String, ByVal strDest As String, ByVal iDocType As SByte, ByVal strSearch As String, ByVal strReplace As String, ByVal bCase As Boolean)
    ' Reads a PDF from the docstore table, overlays replacement text wherever strSearch is
    ' found on each page, then writes the re-encrypted result to disk and (for numeric ids)
    ' back to the database through the sp_DOCSTORE_ADD_binary stored procedure.
    '
    ' strSource  - id of the docstore row to load; the same string is also used as the
    '              PDF password for reading and as the owner password for writing.
    ' strDest    - output file path; when Nothing or empty a random file under C:\tmp\ is used.
    ' iDocType   - not used by this routine.
    ' strSearch  - text to locate on each page.
    ' strReplace - text drawn at each match location.
    ' bCase      - True for a case-sensitive search, False to ignore case.
    '
    ' Does nothing when the query yields no row or a NULL pdf column.

    ' NOTE(review): placeholder connection string; the original also mentioned
    ' "context connection=true", presumably for SQL CLR hosting - confirm which is intended.
    Dim strSqlConnection As String = "Data Source=SERVER;Initial Catalog=DATABASE;Integrated Security=True"
    Dim dbPDF As Byte() = Nothing 'Document bytes from the database

    Using connection As New SqlConnection(strSqlConnection)
        connection.Open()
        ' Parameterised so that strSource can never be interpreted as SQL text.
        ' NOTE(review): the value is sent as a string and converted by the server;
        ' this assumes docstore.id is a numeric column - confirm.
        Using command As New SqlCommand("SELECT pdf FROM docstore WHERE id=@id", connection)
            command.Parameters.AddWithValue("@id", strSource)
            ' TryCast yields Nothing for both "no row" (Nothing) and a NULL column (DBNull).
            dbPDF = TryCast(command.ExecuteScalar(), Byte())
        End Using
    End Using

    If dbPDF Is Nothing Then
        Return
    End If

    Dim passwordBytes As Byte() = Encoding.ASCII.GetBytes(strSource)

    If String.IsNullOrEmpty(strDest) Then
        strDest = "C:\tmp\" & System.IO.Path.GetRandomFileName() & ".pdf"
    End If

    ' The comparison mode is the same for every page, so compute it once.
    Dim scCase As StringComparison = If(bCase, StringComparison.CurrentCulture, StringComparison.CurrentCultureIgnoreCase)

    Dim pdfFileReader As New PdfReader(dbPDF, passwordBytes) 'Read PDF
    Try
        Using msPDF As New MemoryStream()
            Dim psStamp As New PdfStamper(pdfFileReader, msPDF) 'Memory stream as destination
            ' Keep the stream open after the stamper closes so its bytes can still be read.
            psStamp.Writer.CloseStream = False

            ' No user password; owner password is the same value used to open the source.
            psStamp.SetEncryption(Nothing, passwordBytes, PdfWriter.ALLOW_PRINTING, PdfWriter.DO_NOT_ENCRYPT_METADATA)

            For intCurrPage As Integer = 1 To pdfFileReader.NumberOfPages
                Dim lteStrategy As New LocTextExtractionStrategy
                Dim pcbContent As PdfContentByte = psStamp.GetUnderContent(intCurrPage)

                lteStrategy.UndercontentCharacterSpacing = pcbContent.CharacterSpacing
                lteStrategy.UndercontentHorizontalScaling = pcbContent.HorizontalScaling

                ' The returned text is not needed; the call is made so that the strategy
                ' receives the page's text before DoSearchReplace queries it.
                PdfTextExtractor.GetTextFromPage(pdfFileReader, intCurrPage, lteStrategy)

                DoSearchReplace(lteStrategy, pcbContent, psStamp, strSearch, strReplace, scCase, "SearchReplaceLayer")
            Next 'page

            ' Closing the stamper finishes writing the output into msPDF.
            psStamp.Close()
            dbPDF = msPDF.ToArray()
        End Using
    Finally
        pdfFileReader.Close()
    End Try

    ' Write file as check during testing
    File.WriteAllBytes(strDest, dbPDF)

    If IsNumeric(strSource) Then
        Using connection As New SqlConnection(strSqlConnection)
            Using cmd As New SqlCommand("sp_DOCSTORE_ADD_binary", connection) ' updates or inserts into db
                cmd.CommandType = Data.CommandType.StoredProcedure
                ' stored procedure parameters as needed
                cmd.Parameters.Add("@FILE", Data.SqlDbType.VarBinary).Value = dbPDF
                cmd.Parameters.Add("@retvalue", Data.SqlDbType.Int).Direction = Data.ParameterDirection.ReturnValue

                connection.Open()
                cmd.ExecuteNonQuery()
            End Using
        End Using
    End If

End Sub

Public Shared Sub DoSearchReplace(ByRef lteStrategy As LocTextExtractionStrategy, ByRef pcbContent As PdfContentByte, ByRef psStamp As PdfStamper, ByVal strSearch As String, ByVal strReplace As String, ByVal scCase As StringComparison, ByVal strLayer As String)
    ' For every location where strSearch was found by lteStrategy, fills the match rectangle
    ' with white on pcbContent and draws strReplace at the rectangle's lower-left corner
    ' inside a layer named strLayer.
    '
    ' lteStrategy - strategy that has already processed the page's text.
    ' pcbContent  - content stream to draw on.
    ' psStamp     - stamper whose writer owns the new layer.
    ' strSearch   - text to locate; scCase selects case-sensitive or case-insensitive matching.
    ' strReplace  - text to draw at each match.
    ' strLayer    - name given to the layer that holds the replacement text.

    'Determine Match(es)
    Dim lstMatches As List(Of iTextSharp.text.Rectangle) = lteStrategy.GetTextLocations(strSearch, scCase)

    ' Nothing to draw: return before creating a layer so pages without matches
    ' do not each add an empty layer to the document.
    If lstMatches Is Nothing OrElse lstMatches.Count = 0 Then
        Return
    End If

    ' NOTE(review): a new layer is created on every call, so a document with matches on
    ' several pages ends up with several layers of the same name - confirm this is acceptable.
    Dim pdLayer As New PdfLayer(strLayer, psStamp.Writer)

    ' The font is identical for every match, so create it once.
    Dim bfReplace As BaseFont = BaseFont.CreateFont(BaseFont.HELVETICA_OBLIQUE, BaseFont.CP1252, BaseFont.NOT_EMBEDDED)

    For Each rctRect As Rectangle In lstMatches 'Loop Through Each Match

        ' The fill colour must be reset for every match: the text drawing below leaves it
        ' BLACK, which would otherwise paint the next match's masking rectangle black.
        pcbContent.SetColorFill(BaseColor.WHITE)
        pcbContent.Rectangle(rctRect.Left, rctRect.Bottom, rctRect.Width, rctRect.Height)
        pcbContent.Fill()

        pcbContent.BeginLayer(pdLayer)

        pcbContent.SetGState(New PdfGState())
        pcbContent.SetColorFill(BaseColor.BLACK) 'Colour of the replacement letters
        pcbContent.BeginText()
        pcbContent.SetTextMatrix(rctRect.Left, rctRect.Bottom) 'Start at the match's lower-left corner
        pcbContent.SetFontAndSize(bfReplace, 12)
        pcbContent.ShowText(strReplace)
        pcbContent.EndText()

        pcbContent.EndLayer()

    Next 'rectangle
End Sub

乾杯。

+0

你在找VB的解決方案嗎?我可以用C#表達我的想法... – mkl

回答

0

基本思路（僞代碼）如下：

  1. 實現IEventListener/ITextExtractionStrategy
  2. 對文檔的每一頁，將這個類作爲參數傳遞給PdfTextExtractor
  3. 你的類會收到文檔中每個事件的通知。你感興趣的是TextRenderInfo類型的事件（即在頁面上呈現文本的事件）
  4. 收集TextRenderInfo事件，並對其進行排序（按照邏輯閱讀順序），以得到文檔文本的概覽
  5. 使用正則表達式搜索所有符合你期望的文本，並將這些文本映射回它們所來自的TextRenderInfo對象
  6. 基於你已經收集的TextRenderInfo對象以及希望替換成的內容，重建.pdf文檔