2015-05-13 43 views
0

我目前在一個 VB.Net 控制檯應用程序中使用下面的代碼:該程序讀取一個文本文件的內容,提取其中的某些信息,然後將其導出爲 CSV。(標題:在 VB.Net 中自動將 PDF 轉換爲文本)

一切似乎都工作正常,但問題是該文件最初只能以 PDF 形式獲得(這是唯一可用的選項),我必須手動在 Adobe 中打開文件,然後「另存爲文本」。

有沒有辦法自動將 PDF 轉換爲文本文件,或者直接讀取 PDF 而不是文本文件?

如能提供任何指導或可選方案,將不勝感激。

' Converts the text export of a report (c:\temp\input.txt) into a CSV file.
    ' Steps:
    '   1. Load the Plookup table from the Access database into memory.
    '   2. Copy the input file to a stamped .txt next to the stamped .csv output.
    '   3. Walk the input line by line, tracking the current hotel, department and
    '      end date, and write one CSV row per product line that has a positive
    '      quantity + value and is not flagged "Exclude" in Plookup.
    '   4. Insert any product codes that were not in Plookup back into the database.
    Dim iEnd, c, iField As Integer
    Dim iSecs As Long
    Dim sText, sTemp, sSchema As String
    Dim sHotel, sEndDate, sPLU, sTots, sValue, sDept, sFile, sOutFile, sStamp, sDesc As String
    Dim tdate As Date
    Dim LUse As Boolean

    ' Defaults used until the corresponding header rows have been seen.
    sHotel = "Unknown Hotel"
    sEndDate = "01/01/2015"
    sPLU = ""
    sTots = "0"
    sValue = "0"
    sDept = "Unknown Dept"
    sDesc = ""
    LUse = True
    sTemp = ""
    iField = 0
    sSchema = "Chester"

    sFile = "c:\temp\input.txt"

    ' Using blocks guarantee the connection and files are released even when a
    ' line fails to parse or the database update throws.
    Using con As New OleDbConnection("Provider=Microsoft.ACE.OLEDB.12.0; Data Source=C:\temp\TX.accdb;"), InFile As New System.IO.StreamReader(sFile)

        'Open lookup data table
        con.Open()
        Using dbAdapter As New OleDbDataAdapter("SELECT * FROM Plookup", con), cmdbuilder As New OleDbCommandBuilder(dbAdapter)
            Dim dsTX As DataSet = New DataSet()
            Dim changes As DataTable

            dbAdapter.FillSchema(dsTX, SchemaType.Source, "Plookup")
            dbAdapter.Fill(dsTX, "Plookup")

            Dim rstx As DataTable = dsTX.Tables(0)
            Dim productrow() As Data.DataRow

            ' Build the file-name stamp once so the .csv and the .txt copy always
            ' share the same name. "MM" is the month; the lower-case "mm" used
            ' previously is minutes in .NET format strings.
            iSecs = CLng(Timer)
            sStamp = Format$(Now, "yyMMdd") & Trim$(Str$(iSecs))
            sOutFile = "c:\temp\TX" & sStamp & ".csv"
            FileCopy(sFile, "c:\temp\TX" & sStamp & ".txt")

            'Open Output file
            Using OutFile As New System.IO.StreamWriter(sOutFile)
                'Write header
                OutFile.WriteLine("outlet,dept,epos,tots sold,total price,date of sales")

                Do While InFile.Peek() <> -1
                    'Read in text; commas are stripped so they cannot break the CSV
                    sText = InFile.ReadLine().Replace(",", "")

                    ' Short department names ("NN-...") are padded to 9 characters so
                    ' they pass the length check below.
                    If Len(sText) > 2 AndAlso Len(sText) < 9 AndAlso Mid$(sText, 3, 1) = "-" Then
                        sText = sText & Space(9 - Len(sText))
                    End If

                    'Process all rows long enough to carry data
                    If Len(sText) > 8 Then
                        Select Case Left(sText, 7)

                            Case "Consoli", "Quanti " ' Ignore

                            Case "Group b" ' Ignore - but next row is the Hotel Name
                                sText = InFile.ReadLine()
                                ' ReadLine returns Nothing at end of file; keep the
                                ' previous hotel name in that case instead of crashing.
                                If sText IsNot Nothing Then
                                    sText = sText.Replace(",", "")
                                    sHotel = Trim$(Left(sText, 20)) 'The username follows so we may truncate the hotel name
                                End If

                            Case "Date ra" ' End date
                                ' The three date parts are cut from fixed columns 29, 32 and 35.
                                ' NOTE(review): CDate parses with the current culture - confirm
                                ' the report's day/month order matches the machine's locale.
                                sEndDate = Mid$(sText, 29, 2) & "/" & Mid$(sText, 32, 2) & "/" & Mid$(sText, 35, 4)
                                tdate = CDate(sEndDate).AddDays(-1)
                                sEndDate = tdate.ToString("dd/MM/yyyy")

                            Case Else 'Possible Code

                                If Mid$(sText, 3, 1) = "-" Then ' Department Name
                                    sDept = Trim(sText)
                                ElseIf IsNumeric(Left(sText, 7)) Then 'Got a code
                                    sPLU = Trim(Str(Val(Left(sText, 7))))

                                    ' Start every product row from a clean state so a short or
                                    ' malformed row cannot inherit the previous row's figures
                                    ' or description.
                                    sTemp = ""
                                    sTots = "0"
                                    sValue = "0"

                                    'We don't know where the description ends as it contains spaces
                                    'So best way is to start at the end and work back...
                                    ' Each space seen from the right closes one field. The field
                                    ' closed by the 9th space is kept as sValue and the one closed
                                    ' by the 11th as sTots; after that, spaces stop being
                                    ' separators and the rest accumulates as the description.
                                    iEnd = Len(sText)
                                    iField = 0
                                    For c = iEnd To 9 Step -1
                                        If Mid(sText, c, 1) <> " " OrElse iField > 10 Then
                                            sTemp = Mid(sText, c, 1) & sTemp
                                        Else
                                            iField = iField + 1
                                            If iField = 9 Then
                                                sValue = sTemp
                                            ElseIf iField = 11 Then
                                                sTots = sTemp
                                            End If
                                            sTemp = ""
                                        End If
                                    Next

                                    If iField = 10 Then
                                        ' Ran out of text right after the 10th separator: what is
                                        ' left is the quantity and there is no description.
                                        sTots = Trim(sTemp)
                                        sDesc = ""
                                    Else
                                        sDesc = Trim$(sTemp)
                                    End If

                                    'lookup code (same schema key that is used when inserting below)
                                    productrow = rstx.Select("FileID = '" & sSchema.Replace("'", "''") & "' and PLU = '" & sPLU & "'")
                                    If productrow.Length = 0 Then ' product not found
                                        ' Remember the new code so it is written back to Plookup.
                                        rstx.Rows.Add(sSchema, sPLU, sDesc, False)
                                        LUse = True
                                    Else
                                        ' A missing (DBNull) flag is treated as "not excluded".
                                        Dim oExclude As Object = productrow(0)("Exclude")
                                        LUse = IsDBNull(oExclude) OrElse Not CBool(oExclude)
                                    End If

                                    ' NOTE(review): the "> 0" test also drops rows whose quantity
                                    ' plus value is negative - confirm that is intended.
                                    If (Val(sTots) + Val(sValue) > 0) AndAlso LUse Then ' We have a positive sale or value and it is not excluded
                                        OutFile.WriteLine(sHotel & "," & sDept & "," & sPLU & "," & sTots & "," & sValue & "," & sEndDate)
                                    End If
                                End If

                        End Select
                    End If
                Loop
            End Using

            ' Persist newly discovered product codes to the database.
            dbAdapter.UpdateCommand = cmdbuilder.GetUpdateCommand(True)
            dbAdapter.InsertCommand = cmdbuilder.GetInsertCommand(True)
            dbAdapter.DeleteCommand = cmdbuilder.GetDeleteCommand()
            changes = rstx.GetChanges()

            If changes IsNot Nothing Then dbAdapter.Update(changes)
        End Using
    End Using
+0

我相信有很多庫可以將 PDF 轉換爲文本。簡單地用谷歌搜索一下就能找到一些,例如:http://www.squarepdf.net/how-to-convert-pdf-to-text-in-net-vb 。你有沒有試過任何庫? –

+0

我還沒有嘗試過任何庫,因爲我想先看看能否在不使用它們的情況下完成。我會看看你提供的這個鏈接。謝謝 – elmonko

回答

-1

試試 iTextSharp。iTextSharp 是一個 .NET DLL,藉助它可以從 PDF 中提取內容。點擊 here 查看參考資料和示例代碼(雖然代碼是用 C# 寫的,但它只是給你提供一個思路的參考)。