2010-05-05 102 views
0

我正試圖解析來自在java中xhtml接收的bing搜索引擎收到的搜索查詢。我正在使用sax XmlReader來讀取結果,但我一直在獲取錯誤。 這裏是我的代碼,這是給讀者的哈德勒:解析來自Bing的XHTML結果

import org.xml.sax.Attributes; 
import org.xml.sax.SAXException; 
import org.xml.sax.helpers.DefaultHandler; 


public class XHTMLHandler extends DefaultHandler{ 
    public XHTMLHandler() 
    { 
     super(); 
    } 
    public void startDocument() 
    { 
     System.out.println("Start document"); 
    } 
    public void endDocument() 
    { 
    System.out.println("End document"); 
    } 
    public void startElement (String uri, String name,String qName, Attributes atts) 
    { 
     if ("".equals (uri)) 
       System.out.println("Start element: " + qName); 
      else 
       System.out.println("Start element: {" + uri + "}" + name); 
    } 

    public void endElement (String uri, String name, String qName) 
    { 
    if ("".equals (uri)) 
     System.out.println("End element: " + qName); 
    else 
     System.out.println("End element: {" + uri + "}" + name); 
    } 
    public void startPrefixMapping (String prefix, String uri) 
     throws SAXException { 
    } 
    public void endPrefixMapping (String prefix) 
     throws SAXException { 
    } 



    public void characters (char ch[], int start, int length) 
     { 
     System.out.print("Characters: \""); 
     for (int i = start; i < start + length; i++) { 
      switch (ch[i]) { 
      case '\\': 
      System.out.print("\\\\"); 
      break; 
      case '"': 
      System.out.print("\\\""); 
      break; 
      case '\n': 
      System.out.print("\\n"); 
      break; 
      case '\r': 
      System.out.print("\\r"); 
      break; 
      case '\t': 
      System.out.print("\\t"); 
      break; 
      default: 
      System.out.print(ch[i]); 
      break; 
      } 
     } 
     System.out.print("\"\n"); 
     } 

} 

,這是程序本身:

import java.io.BufferedReader; 
import java.io.IOException; 
import java.io.InputStreamReader; 
import java.io.OutputStreamWriter; 
import java.net.HttpRetryException; 
import java.net.HttpURLConnection; 
import java.net.URL; 

import org.xml.sax.InputSource; 
import org.xml.sax.SAXException; 
import org.xml.sax.XMLReader; 
import org.xml.sax.helpers.XMLReaderFactory; 


public class Searching { 
    private String m_urlBingSearch = "http://www.bing.com/search?q="; 
    private HttpURLConnection m_httpCon; 
    private OutputStreamWriter m_streamWriter; 
    //private BufferedReader m_bufferReader; 
    private URL m_serverAdress; 
    private StringBuilder sb; 
    private String m_line; 
    private InputSource m_inputSrc; 
    public Searching() 
    { 

     m_httpCon = null; 
     m_streamWriter = null; 
     //m_bufferReader = null; 
     m_serverAdress = null; 
     sb = null; 
     m_line = new String(); 
    } 
    public void SearchBing(String searchPrms) throws SAXException,IOException 
    { 


      //set up connection 
      sb = new StringBuilder(); 
      sb.append(m_urlBingSearch); 
      sb.append(searchPrms); 
      m_serverAdress = new URL(sb.toString()); 
      m_httpCon = (HttpURLConnection)m_serverAdress.openConnection(); 
      m_httpCon.setRequestMethod("GET"); 
      m_httpCon.setDoOutput(true); 
      m_httpCon.setConnectTimeout(10000); 
      m_httpCon.connect(); 
      //m_streamWriter = new OutputStreamWriter(m_httpCon.getOutputStream()); 
      //m_bufferReader = new BufferedReader(new InputStreamReader(m_httpCon.getInputStream())); 
      XMLReader reader = XMLReaderFactory.createXMLReader(); 
      XHTMLHandler handle = new XHTMLHandler(); 
      reader.setContentHandler(handle); 
      reader.setErrorHandler(handle); 
      //reader.startPrefixMapping("html", "http://www.w3.org/1999/xhtml"); 
      handle.startPrefixMapping("html", "http://www.w3.org/1999/xhtml"); 
      m_inputSrc = new InputSource(m_httpCon.getInputStream()); 
      reader.parse(m_inputSrc); 
      m_httpCon.disconnect(); 


    } 
    public static void main(String [] args) throws SAXException,IOException 
    { 
     Searching s = new Searching(); 
     s.SearchBing("beatles"); 
    } 
} 

這是我的錯誤信息:

 
Exception in thread "main" java.io.IOException: Server returned HTTP response code: 503 for URL: http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd 
    at sun.net.www.protocol.http.HttpURLConnection.getInputStream(Unknown Source) 
    at com.sun.org.apache.xerces.internal.impl.XMLEntityManager.setupCurrentEntity(Unknown Source) 
    at com.sun.org.apache.xerces.internal.impl.XMLEntityManager.startEntity(Unknown Source) 
    at com.sun.org.apache.xerces.internal.impl.XMLEntityManager.startDTDEntity(Unknown Source) 
    at com.sun.org.apache.xerces.internal.impl.XMLDTDScannerImpl.setInputSource(Unknown Source) 
    at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$DTDDriver.dispatch(Unknown Source) 
    at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$DTDDriver.next(Unknown Source) 
    at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$PrologDriver.next(Unknown Source) 
    at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl.next(Unknown Source) 
    at com.sun.org.apache.xerces.internal.impl.XMLNSDocumentScannerImpl.next(Unknown Source) 
    at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source) 
    at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(Unknown Source) 
    at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(Unknown Source) 
    at com.sun.org.apache.xerces.internal.parsers.XMLParser.parse(Unknown Source) 
    at com.sun.org.apache.xerces.internal.parsers.AbstractSAXParser.parse(Unknown Source) 
    at Searching.SearchBing(Searching.java:57) 
    at Searching.main(Searching.java:65) 

有人可以幫忙嗎?我認爲這是與DTD,但我不知道熱修復它

+1

不Bing有一些你可以使用的web服務,而不是屏幕抓取他們的HTML? – voyager 2010-05-05 17:33:54

+1

@voyager:我同意。我想這可能是你所指的:http://msdn.microsoft.com/en-us/library/dd900818.aspx – 2010-05-05 17:42:51

回答

1

服務器返回的HTTP響應代碼:503網址:http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd

顯然你試圖解析使用外部實體獲取解析器的XHTML文檔。它拖入DTD外部子集,以便它可以讀取任何針對HTML特定實體的聲明,例如&nbsp;&eacute;

您從託管該DTD外部子集的w3.org服務器獲得HTTP 503,但即使您不是這樣,仍然會非常不禮貌地用每個DTD請求轟炸該服務器當你做一個刮擦。 (也許他們阻止你,因爲這個原因?)

你可以創建一個EntityResolver來返回你自己的本地DTD副本,或者一個只包含實體定義的簡化版本。或者,如果XMLReader實現支持該功能,則可以使用setFeature來關閉該選項,否則您可以要求讀者不要獲取DTD。 (例如,for Xerxes)。儘管如果文檔包含像&nbsp;這樣的非內建實體參考,您可能會遇到麻煩。

此外,由於這是一個直播網頁,正在以text/html的形式提供,尤其是因爲它來自微軟,所以假設它仍然保持良好形態可能相當樂觀!屏幕抓取通常最好使用容忍HTML怪異的解析器。但正如上面的評論所述,使用API​​在任何情況下都比屏幕抓取好得多。