2014-04-14 55 views
1

我正在使用WebHarvest工具從幾個網站抓取網絡數據。我已經看過了那些示例,但找不到先在網站中進行身份驗證、然後再從中抓取數據的方法。有誰可以提供一個示例配置,來實現通過身份驗證的Web數據抓取?如何發送登錄參數然後接收主頁內容?感謝您的幫助。WebHarvest - 使用身份驗證抓取數據

回答

0

我剛剛修改了Web Harvest的一個示例(http://web-harvest.sourceforge.net/samples.php?num=4),它使用登錄憑據後可以正常運行。您可以拿下面更新後的代碼試試看:

<?xml version="1.0" encoding="UTF-8"?> 

<config charset="ISO-8859-1"> 

    <!--
     Web-Harvest configuration in three steps:
       1. POST the login form fields to the site.
       2. Define the URL of the page to read.
       3. Write the 'story' blocks found on that page into an XML file.
    -->

    <!--
     Sends a POST request carrying the login form fields.
     USERID and PASSWORD hold hard-coded sample values; replace them with
     real credentials before use.
     NOTE(review): the login URL uses plain http, so the credentials are sent
     unencrypted. Confirm whether the site offers an https endpoint.
     NOTE(review): presumably the session established here is reused by the
     later <http> call. Confirm against the Web-Harvest documentation.
    -->
    <http method="post" url="http://www.nytimes.com/auth/login"> 
     <http-param name="is_continue">true</http-param> 
     <http-param name="URI">http://</http-param> 
     <http-param name="OQ"></http-param> 
     <http-param name="OP"></http-param> 
     <http-param name="USERID">web-harvest</http-param> 
     <http-param name="PASSWORD">web-harvest</http-param> 
    </http> 

    <!-- URL of the page that is fetched inside the loop's <list> below. -->
    <var-def name="startUrl">http://www.nytimes.com/pages/todayspaper/index.html</var-def> 

    <!--
     Writes the result to a file whose name embeds the value of sys.date().
     The content is, in order: an opening root tag with a date attribute
     formatted as dd.MM.yyyy, the loop output, and the matching closing tag.
     NOTE(review): the path is an absolute Windows path (D:/nytimes/); adjust
     it for the target machine.
     NOTE(review): "newyourk_times" looks like a misspelling of "newyork_times".
     If it is corrected, change the opening and closing tags together.
     No comments are added inside <template> or next to the CDATA sections
     because text there is part of what gets written.
    -->
    <file action="write" path="D:/nytimes/nytimes${sys.date()}.xml" charset="UTF-8"> 
     <template> 
      <![CDATA[ <newyourk_times date="${sys.datetime("dd.MM.yyyy")}"> ]]> 
     </template> 

     <loop item="articleUrl" index="i"> 
      <!-- fetches the start page, converts it with html-to-xml and selects every div with class 'story'; each loop item is such a div, not a URL (the name articleUrl is kept as is) --> 
      <list> 
       <xpath expression="//div[@class='story']"> 
        <html-to-xml> 
         <http url="${startUrl}"/> 
        </html-to-xml> 
       </xpath> 
      </list> 

      <!-- passes the current item to XQuery as $doc and returns it unchanged; no per-article download happens in this body --> 
      <body> 
       <xquery> 
        <xq-param name="doc"> 
         <var name="articleUrl"/> 
        </xq-param> 
        <xq-expression><![CDATA[ 
         declare variable $doc as node() external; 
         $doc 
        ]]></xq-expression> 
       </xquery> 
      </body> 
     </loop> 

     <![CDATA[ </newyourk_times> ]]> 
    </file> 

</config>