2016-06-13 60 views
2

我對VBA和HTML/XHTML非常陌生,但通過在線研究和其他精彩成員的幫助,我已經設法編寫了一個代碼來提取我想要的數據。由於XHTML是我想要的元素,所以我很難識別這些元素的ID,所以我認爲這是我最糟糕的地方。XHTML網站掠奪指導

網站:http://www.usbanklocations.com/banks.php?q=&ct=&ml=30&lc=

這是我想要的代碼做: 拉銀行名稱,地址,電話號碼,總存款和總資產 - 定的銀行名稱和城市我在我的Excel表中提供。

這裏是我的代碼:

Private Declare Sub Sleep Lib "kernel32.dll" (ByVal dwMilliseconds As Long) 
Sub CommunityBanks() 
    Dim IE As Object, TableResults As Object, webRow As Object, BankName As Variant, page As Long, pageTotal As Long, r As Long 
    Dim beginTime As Date, i As Long, myvalue As Variant 

Set IE = CreateObject("internetexplorer.application") 
IE.navigate "http://www.usbanklocations.com/banks.php?name=" & Range("A2").Value & "+Bank&ml=30&lc=" & Range("B2").Value & "%2C+TX" 
IE.Visible = True 

Do While IE.Busy Or IE.readystate <> 4 '4 = READYSTATE_COMPLETE 
    DoEvents 
Loop 

'input bank name into form 
'myvalue = InputBox("Enter City. Press okay to begin search", "Bank Search") 
'Range("F3").Value = myvalue 
'IE.document.getelementbyid("MainContent_txtCity").Value = "LegacyTexas" 
'click find button 
'IE.document.getelementbyid("MainContent_btn").Click 
'Sleep 5 * 1000 
IE.document.getelementbytagname("table").getelementsbyclassname("btn").Click 
Sleep 5 * 1000 

'total pages 
pageTotal = IE.document.getelementbyid("lsortby").innertext 
page = 0 

Do Until page = pageTotal 
    DoEvents 
    page = IE.document.getelementbyclassname("lsortby").innertext 
    With IE.document.getelementbyid("main") 
     For r = 1 To .Rows.Length - 1 
      If Not IsArray(BankName) Then 
       ReDim BankName(7, 0) As Variant 
      Else 
       ReDim Preserve BankName(7, UBound(BankName, 2) + 1) As Variant 
      End If 

      BankName(0, UBound(BankName, 2)) = .Rows(r).Cells(0).innertext 
     Next r 
    End With 

    If page < pageTotal Then 
     IE.document.getelementbyclassname("panelpn").Click 
     beginTime = Now 
     Application.Wait (Now + TimeValue("00:00:05")) 
    End If 
Loop 

For r = 0 To UBound(BankName, 2) 
    IE.navigate "http://www.usbanklocations.com/" & BankName(0, r) 
    Do While IE.Busy Or IE.readystate <> 4 '4 = READYSTATE_COMPLETE 
     DoEvents 
    Loop 
    'wait 5 sec. for screen refresh 
    Sleep 5 * 1000 

    With IE.document.getelementbytagname("table") 
     For i = 0 To .Rows.Length - 1 
      DoEvents 
      Select Case .Rows(i).Cells(0).innertext 
      Case "Name:" 
       BankName(1, r) = .Rows(i).Cells(1).innertext 
      Case "Location:" 
       BankName(2, r) = .Rows(i).Cells(1).innertext 
      Case "Phone:" 
       BankName(3, r) = .Rows(i).Cells(1).innertext 
      Case "Branch Deposit:" 
       BankName(4, r) = Replace(Replace(.Rows(i).Cells(1).innertext, ",", ""), "$", "") 
      Case "Total Assets:" 
       BankName(5, r) = Replace(Replace(.Rows(i).Cells(1).innertext, ",", ""), "$", "") 
      End Select 
     Next i 
    End With 
Next r 


IE.Quit 
Set IE = Nothing 

'post result on Excel cell 
Worksheets(1).Range("A9").Resize(UBound(BankName, 2) + 1, UBound(BankName, 1) + 1).Value = Application.Transpose(BankName) 
End Sub 

預先感謝您!我將不勝感激任何幫助。

+1

The [ToS for usbanklocations.com](http://www.usbanklocations.com/terms-of-use.php)指出用戶不能'聚集,複製或複製USBANKLOCATIONS.COM上的內容 - so我敢肯定,你不應該刮他們的網站...... –

+0

由「在」,他們指的是具體到他們的網站的行動。不是用戶可以使用的內容。您可以複製/粘貼信息。 –

+0

好的 - 我通常不會參與刮鬍子的問題,只是爲了謹慎。我只是指出,如果你不知道,但如果你很高興,這是好的,那麼足夠公平。 –

回答

2

考慮它使用XHR而不是IE和基於分HTML內容解析下面的例子:

Option Explicit 

Sub Test_usbanklocations() 

    Dim oSource, oDestination, y, oSrcRow, sName, sCity, sDist, sUrl0, sUrl1, sUrl2, lPage, sResp1, sResp2, i, a1, a2, a3, a4, a5 

    Set oSource = Sheets(1) 
    Set oDestination = Sheets(2) 
    oDestination.Cells.Delete 
    DataOutput oDestination, 1, Array("Name", "Location", "Phone", "Total Assets", "Total Deposits") 
    y = 2 

    For Each oSrcRow In oSource.UsedRange.Rows 
     sName = oSrcRow.Cells(1, 1).Value 
     sCity = oSrcRow.Cells(1, 2).Value 
     sDist = oSrcRow.Cells(1, 3).Value 
     sUrl0 = "http://www.usbanklocations.com/banks.php?q=" & EncodeUriComponent(sName) & "&lc=" & EncodeUriComponent(sCity) & "&ml=" & sDist 
     sUrl1 = sUrl0 
     lPage = 1 
     Do 
      sResp1 = GetXHR(sUrl1) 
      If InStr(sResp1, "We can not find the address you provided. Please check.") > 0 Then Exit Do 
      a1 = Split(sResp1, "<div class=""pl") 
      For i = 1 To UBound(a1) 
       a2 = Split(a1(i), "</div>", 3) 
       a3 = Split(a2(1), "<a href=""", 2) 
       a4 = Split(a3(1), """>", 2) 
       sUrl2 = "http://www.usbanklocations.com" & a4(0) 
       sResp2 = GetXHR(sUrl2) 
       a5 = Array(_ 
        GetFragment(sResp2, "<b>Name:</b></td><td>", "</td>"), _ 
        Replace(GetFragment(sResp2, "<b>Location:</b></td><td>", "</td>"), "View Other Branches", ""), _ 
        GetFragment(sResp2, "<b>Phone:</b></td>", "</td>"), _ 
        GetFragment(sResp2, "<b>Total Assets:</b></td><td>", "</td>"), _ 
        GetFragment(sResp2, "<b>Total Deposits:</b></td><td>", "</td>") _ 
       ) 
       DataOutput oDestination, y, a5 
       y = y + 1 
       DoEvents 
      Next 
      If InStr(sResp1, "Next Page &gt;") = 0 Then Exit Do 
      lPage = lPage + 1 
      sUrl1 = sUrl0 & "&ps=" & lPage 
      DoEvents 
     Loop 
    Next 

    MsgBox "Completed" 

End Sub 

Function GetXHR(sUrl) 

    With CreateObject("MSXML2.XMLHTTP") 
     .Open "GET", sUrl, False 
     .Send 
     GetXHR = .ResponseText 
    End With 

End Function 

Sub DataOutput(oSht, y, aValues) 

    With oSht.Cells(y, 1).Resize(1, UBound(aValues) + 1) 
     .NumberFormat = "@" 
     .Value = aValues 
    End With 

End Sub 

Function GetFragment(sText, sPatt1, sPatt2) 

    Dim a1, a2 

    a1 = Split(sText, sPatt1, 2) 
    If UBound(a1) <> 1 Then Exit Function 
    a2 = Split(a1(1), sPatt2, 2) 
    If UBound(a2) <> 1 Then Exit Function 
    GetFragment = GetInnerText(a2(0)) 

End Function 

Function EncodeUriComponent(sText) 

    Static objHtmlfile As Object 

    If objHtmlfile Is Nothing Then 
     Set objHtmlfile = CreateObject("htmlfile") 
     objHtmlfile.parentWindow.execScript "function encode(s) {return encodeURIComponent(s)}", "jscript" 
    End If 
    EncodeUriComponent = objHtmlfile.parentWindow.encode(sText) 

End Function 

Function GetInnerText(sText) 

    With CreateObject("htmlfile") 
     .Write ("<body>" & sText & "</body>") 
     GetInnerText = .DocumentElement.Document.GetElementsByTagName("body")(0).InnerText 
    End With 

End Function 

作爲一個例子,第一個工作表中包含的數據搜索(銀行名稱,位置和距離的細化):

source

然後,第二工作表上結果如下:

result

+0

你真棒@墨守成規!這個XHR/api方法是一個很好的基礎。非常感謝你。實際上,我只是熟悉XHR,這將是我的第一個以這種格式來看待的代碼。 我注意到大型數據集的速度要快得多。非常感謝。 –

+0

@ K.K。順便說一句,使XHR異步,你可以達到更高的速度,但代碼應該與事件一起工作。 – omegastripes

+0

@omegastripes,謝謝你的代碼。對我來說這是一項全新的技能。我從中學到了東西。 – PaichengWu