2016-05-12 83 views
0

我有以下代碼,它查看我電子表格的A列中的每個單元格,搜索它在指定PDF中找到的文本,然後提取頁面在那裏它將文本視爲PDF,並用電子表格單元格中的值命名。代碼的工作原理很慢,但我可能需要在PDF中搜索多達200個字,最長可達600頁。有沒有辦法讓代碼更快?目前它循環遍歷每個單元搜索每個頁面循環遍歷每個單詞,直到它找到單元格中的單詞。Excel VBA搜索PDF和提取文本並命名頁面

Sub test_with_PDF() 

    Dim objApp As Object 
    Dim objPDDoc As Object 
    Dim objjso As Object 
    Dim wordsCount As Long 
    Dim page As Long 
    Dim i As Long 
    Dim strData As String 
    Dim strFileName As String 
    Dim lastrow As Long, c As Range 
    Dim PageNos As Integer 
    Dim newPDF As Acrobat.CAcroPDDoc 
    Dim NewName As String 
    Dim Folder As String 
    lastrow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row 

    strFileName = selectFile() 
    Folder = GetFolder() 

    Set objApp = CreateObject("AcroExch.App") 
    Set objPDDoc = CreateObject("AcroExch.PDDoc") 
    'AD.1 open file, if =false file is damage 
    If objPDDoc.Open(strFileName) Then 
     Set objjso = objPDDoc.GetJSObject 

PageNos = 0 
For Each c In Sheets("Sheet1").Range("A2:A" & lastrow) 

     For page = 0 To objPDDoc.GetNumPages - 1 
      wordsCount = objjso.GetPageNumWords(page) 
      For i = 0 To wordsCount 

       If InStr(1, c.Value, ", ") = 0 Then 

        If objjso.getPageNthWord(page, i) = c.Value Then 
         PageNos = PageNos + 1 
         If FileExist(Folder & "\" & c.Offset(0, 4) & ".pdf") Then 

           Set newPDF = CreateObject("AcroExch.pdDoc") 
           NewName = Folder & "\" & c.Offset(0, 4) & ".pdf" 
           newPDF.Open (NewName) 
           newPDF.InsertPages lngPages, objPDDoc, page, 1, 0 
           newPDF.Save 1, NewName 
           newPDF.Close 
           Set newPDF = Nothing 
           Exit For 
         Else 
           Set newPDF = CreateObject("AcroExch.PDDoc") 
           newPDF.Create 
           NewName = Folder & "\" & c.Offset(0, 4) & ".pdf" 
           newPDF.InsertPages -1, objPDDoc, page, 1, 0 
           newPDF.Save 1, NewName 
           newPDF.Close 
           Set newPDF = Nothing 
           Exit For 

         End If 
        End If 
       Else 

       If objjso.getPageNthWord(page, i) = c.Offset(0, 1).Value Then 
        If objjso.getPageNthWord(page, i + 1) = c.Offset(0, 2).Value Then 
         PageNos = PageNos + 1 
         If FileExist(Folder & "\" & c.Offset(0, 4) & ".pdf") Then 

           Set newPDF = CreateObject("AcroExch.pdDoc") 
           NewName = Folder & "\" & c.Offset(0, 4) & ".pdf" 
           newPDF.Open (NewName) 
           newPDF.InsertPages lngPages, objPDDoc, page, 1, 0 
           newPDF.Save 1, NewName 
           newPDF.Close 
           Set newPDF = Nothing 
           Exit For 
         Else 
           Set newPDF = CreateObject("AcroExch.PDDoc") 
           newPDF.Create 
           NewName = Folder & "\" & c.Offset(0, 4) & ".pdf" 
           newPDF.InsertPages -1, objPDDoc, page, 1, 0 
           newPDF.Save 1, NewName 
           newPDF.Close 
           Set newPDF = Nothing 
           Exit For 

         End If 
         Exit For 
        End If 
       End If 
      End If 
      Next i 
     Next page 
     c.Offset(0, 3).Value = PageNos 
     PageNos = 0 
    Next c 
    MsgBox "Done" 
    Else 
     MsgBox "error!" 
    End If 
End Sub 

Function FileExist(path As String) As Boolean 
    If Dir(path) <> vbNullString Then FileExist = True 
End Function 
Function selectFile() 
Dim fd As FileDialog, fileName As String 

On Error GoTo ErrorHandler 

Set fd = Application.FileDialog(msoFileDialogFilePicker) 

fd.AllowMultiSelect = False 

If fd.Show = True Then 
    If fd.SelectedItems(1) <> vbNullString Then 
     fileName = fd.SelectedItems(1) 
    End If 
Else 
    'Exit code if no file is selected 
    End 
End If 

'Return Selected FileName 
selectFile = fileName 

Set fd = Nothing 

Exit Function 

ErrorHandler: 
Set fd = Nothing 
MsgBox "Error " & Err & ": " & Error(Err) 

End Function 
Function GetFolder() As String 
    Dim fldr As FileDialog 
    Dim sItem As String 
    Set fldr = Application.FileDialog(msoFileDialogFolderPicker) 
    With fldr 
     .Title = "Select the Folder where you want you new PDFs to go" 
     .AllowMultiSelect = False 
     .InitialFileName = Application.DefaultFilePath 
     If .Show <> -1 Then GoTo NextCode 
     sItem = .SelectedItems(1) 
    End With 
NextCode: 
    GetFolder = sItem 
    Set fldr = Nothing 
End Function 

非常感謝提前。

+0

顯然,您正在使用外部庫來搜索PDF文件(這是實現您想要的唯一可能的方式)。所以,代碼的速度取決於這些庫。從不同的軟件供應商處獲得更好的新版本,就完成了。簡而言之:你不需要更好/更快的VBA代碼,而是更好的外部庫。所以,本質上你的問題是「推薦更好的庫從VBA中調用」。然而,這個網站的這些問題是[off-topic](http://stackoverflow.com/help/on-topic)。 – Ralph

回答

0

對不起,發佈一個快速,不完整的答案,但我想我可以指出你在一個好方向。

與其讓系統查找數百億次的兩個術語,然後進行數百億次比較,將搜索條件放入數組中,並將每個頁面的文本轉換爲長字符串。必須做一個查詢和每頁200個比較。

'Dim your Clipboard functions 
Public Declare PtrSafe Function OpenClipboard Lib "user32" (ByVal hwnd As Long) As Long 
Public Declare PtrSafe Function EmptyClipboard Lib "user32"() As Long 
Public Declare PtrSafe Function CloseClipboard Lib "user32"() As Long 

'... 

Dim objData As New MSForms.DataObject 
Dim arrSearch() As String 
Dim strTxt As String 

'... 

'Create array of search terms 
For i = 2 To lastrow 
    arrSearch(i - 2) = Sheets("Sheet1").Cells(1, i) 
Next i 

For page = 0 To objPDDoc.GetNumPages - 1 

    '[Move each page into a new document. You already have that code] 

    'Clear clipboard 
    OpenClipboard (0&) 
    EmptyClipboard 
    CloseClipboard 

    'Copy page to clipboard 
    objApp.MenuItemExecute ("SelectAll") 
    objApp.MenuItemExecute ("Copy") 
    'You can also do this with the JavaScript object: objjso.ExecMenuItem("Item Name") 
    'You may have to insert a waiting function like sleep() here to wait for the action to complete 

    'Put data from clipboard into a string. 
    objData.GetFromClipboard 
    strTxt = objData.GetText 'Now you can search the entire content of the page at once, within memory 

    'Compare each element of the array to the string 
    For i = LBound(arrSearch) To UBound(arrSearch) 
     If InStr(1, strTxt, arrSearch(i)) > 0 Then 
      '[You found a match. Your code here] 
     End If 
    Next i 

Next page 

這仍然很麻煩,因爲您必須在新文檔中打開每個頁面。如果有一種很好的方法可以純粹通過文本來確定您所在的頁面(例如頁面底部的頁碼,然後緊接頁面頂部的頁眉),那麼您可能會考慮複製整個頁面將文本文本合併爲一個字符串,然後使用文本中的線索來決定找到匹配項後要提取哪個頁面。我相信這會快很多。

+0

謝謝!我會放棄它。 –

+0

@Emma Lavallin你試過了嗎? – jlookup

+0

我很抱歉極爲遲延的回覆,在此期間我已經搬到了新西蘭!感謝你的代碼,它確實有效,但仍然有點慢。但是,它的確激勵了我,導致我創建了下面的代碼。您必須首先將PDF中的所有文本複製並粘貼到Excel中,但付出的代價很小。 –

0
Sub BatchRenameCS() 

Dim objApp As Object 
Dim objPDDoc As Object 
Dim objjso As Object 
Dim newPDF As Acrobat.CAcroPDDoc 
Dim lastrow2 As Long 
Dim strFileName As String 
Dim Folder As String 
Dim Page As Long 
Dim Cell As Long 
Dim PDFCharacterCount() As Long 
Dim CharacterCount As Long 
Dim i As Integer 
Dim c As Integer 
Dim x As Integer 
Dim strSource As String 
Dim strResult As String 
Dim PDFCharacters As String 
Dim PDFCharacters2 As String 
Dim PDFPasteData() As String 
Dim PasteDataPage As Integer 
Dim LastRow As Long 
Dim NewName As String 
Dim NewNamePageNum As Integer 
Dim Check() 

Sheets("Sheet1").Range("C:D").ClearContents 

strFileName = selectFile() 
Folder = GetFolder() 

'create array with pdf word count 
Set objApp = CreateObject("AcroExch.App") 
Set objPDDoc = CreateObject("AcroExch.PDDoc") 
'AD.1 open file, if =false file is damage 
    If objPDDoc.Open(strFileName) Then 
     Set objjso = objPDDoc.GetJSObject 

ReDim PDFCharacterCount(1 To objPDDoc.GetNumPages) As Long 

For Page = 1 To objPDDoc.GetNumPages 
PDFCharacters = "" 
PDFCharacters2 = "" 
    For c = 0 To objjso.GetPageNumWords(Page - 1) 
    PDFCharacters = PDFCharacters & objjso.getPageNthWord(Page - 1, c) 
    Next c 
    For i = 1 To Len(PDFCharacters) 
     Select Case Asc(Mid(PDFCharacters, i, 1)) 
      Case 48 To 57, 65 To 90, 97 To 122: 
      PDFCharacters2 = PDFCharacters2 & (Mid(PDFCharacters, i, 1)) 
      Case Else 
      PDFCharacters2 = PDFCharacters2 & "" 
     End Select 
    Next 
    PDFCharacterCount(Page) = Len(PDFCharacters2) 

Next Page 

lastrow2 = Sheets("Sheet2").Cells(Rows.Count, "A").End(xlUp).Row 
Page = 1 
ReDim PDFPasteData(1 To objPDDoc.GetNumPages) As String 
For Cell = 1 To lastrow2 
    strResult = "" 
    strSource = Sheets("Sheet2").Cells(Cell, 1).Text 
    PDFPasteData(Page) = PDFPasteData(Page) & " " & strSource 
    For i = 1 To Len(strSource) 
     Select Case Asc(Mid(strSource, i, 1)) 
      Case 48 To 57, 65 To 90, 97 To 122: 
      strResult = strResult & (Mid(strSource, i, 1)) 
      Case Else 
      strResult = strResult & "" 
     End Select 
    Next 

CharacterCount = CharacterCount + Len(strResult) 

If CharacterCount = PDFCharacterCount(Page) Then 
CharacterCount = 0 
Page = Page + 1 
End If 

Next Cell 
ReDim Check(2, objPDDoc.GetNumPages) 
LastRow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row 
For Each LookUpCell In Worksheets("Sheet1").Range("A2:A" & LastRow) 
    For PasteDataPage = 1 To objPDDoc.GetNumPages 
     If InStr(PDFPasteData(PasteDataPage), LookUpCell.Value) Then 
     Check(1, PasteDataPage) = Check(1, PasteDataPage) + 1 
     Check(2, PasteDataPage) = Check(2, PasteDataPage) & LookUpCell.Offset(0, 1).Value & Chr(10) 
           If FileExist(Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf") Then 

             Set newPDF = CreateObject("AcroExch.pdDoc") 
             NewName = Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf" 
             newPDF.Open (NewName) 
             newPDF.InsertPages newPDF.GetNumPages - 1, objPDDoc, PasteDataPage - 1, 1, 0 
             newPDF.Save 1, NewName 
             newPDF.Close 
             Set newPDF = Nothing 
           Else 
             Set newPDF = CreateObject("AcroExch.PDDoc") 
             newPDF.Create 
             NewName = Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf" 
             newPDF.InsertPages -1, objPDDoc, PasteDataPage - 1, 1, 0 
             newPDF.Save 1, NewName 
             newPDF.Close 
             Set newPDF = Nothing 
           End If 
     End If 
    Next PasteDataPage 
Next LookUpCell 
x = 1 
For PasteDataPage = 1 To objPDDoc.GetNumPages 
    If Check(1, PasteDataPage) <> 1 Then 
    Sheets("Sheet1").Cells(x, 3) = PasteDataPage 
    Sheets("Sheet1").Cells(x, 4) = Check(2, PasteDataPage) 
    x = x + 1 
    End If 
Next PasteDataPage 
End If 
MsgBox "Done" 
End Sub 
Function FileExist(path As String) As Boolean 
    If Dir(path) <> vbNullString Then FileExist = True 
End Function 
Function selectFile() 
Dim fd As FileDialog, fileName As String 
On Error GoTo ErrorHandler 
Set fd = Application.FileDialog(msoFileDialogFilePicker) 
fd.AllowMultiSelect = False 
If fd.Show = True Then 
    If fd.SelectedItems(1) <> vbNullString Then 
     fileName = fd.SelectedItems(1) 
    End If 
Else 
    'Exit code if no file is selected 
    End 
End If 
'Return Selected FileName 
selectFile = fileName 
Set fd = Nothing 
Exit Function 
ErrorHandler: 
Set fd = Nothing 
MsgBox "Error " & Err & ": " & Error(Err) 
End Function 
Function GetFolder() As String 
    Dim fldr As FileDialog 
    Dim sItem As String 
    Set fldr = Application.FileDialog(msoFileDialogFolderPicker) 
    With fldr 
     .Title = "Select the Folder where you want you new PDFs to go" 
     .AllowMultiSelect = False 
     .InitialFileName = Application.DefaultFilePath 
     If .Show <> -1 Then GoTo NextCode 
     sItem = .SelectedItems(1) 
    End With 
NextCode: 
    GetFolder = sItem 
    Set fldr = Nothing 

End Function 
+0

請編輯更多信息。僅限代碼和「嘗試這個」的答案是不鼓勵的,因爲它們不包含可搜索的內容,也不解釋爲什麼有人應該「嘗試這個」。 – abarisone