2013-10-10 22 views
1

我正在嘗試從網頁讀取源代碼。我的Java代碼從java中的網頁讀取源代碼

import java.net.*; 
import java.io.*; 
import java.util.*; 
import javax.swing.JOptionPane; 

class Testing{ 
public static void Connect() throws Exception{ 


    URL url = new URL("http://excite.com/education"); 
    URLConnection spoof = url.openConnection(); 


    spoof.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0; H010818)"); 
    BufferedReader in = new BufferedReader(new InputStreamReader(spoof.getInputStream())); 
    String strLine = ""; 


    while ((strLine = in.readLine()) != null){ 


    System.out.println(strLine); 
    } 

    System.out.println("End of page."); 
} 

public static void main(String[] args){ 

    try{ 

    Connect(); 
    }catch(Exception e){ 

    } 
} 

當我編譯並運行這段代碼,它提供了以下的輸出:

I�%&/m�{J�J��t� $ @ IG#)* EVE F的@흼{{; N'\fdlJɞ〜|「〜$}>?!? 47N +ӲMN?JtZfMGjR!9?> JgEGe [ ⳏ W ? 8 |8 ho 0' |փ: - | L Uο ׫ m zt n3 l\ w O^f G[ CG<y6KgMrgǟyEִyh~ؗ˲XL =ڢZ /(կ^ OUU6 & 6_ @yC}�p�y���lAH�ͯ��zF#�V�6_��}��)�v=J+�$��̤�G�Y�L�b���wS"�7�y^����Z�m���Y:ɛ���J<N_�Y=���U�f���,���y�Q2(J٩P!ͨ�i����1&F0&ૼn�?�x�T��h�Qzw�+����n�)�h��K��2����8g����⮥��A0 ���1I�%����Q�Z����{��������w���?x����N�?�<d�S��۫�%a|4�j��z���k�Bak��k-�c�z�g��z���l> ֎s ^, 5 /B { ]] Ý ֳ y { _l 8g k ӫ b 「+ | ( M^[ J�P��_�..?������x�Z�$ E> 느 u E〜 {媘 f e1ͷ QZ , f e 3Jٻb^4 4 >y ; < { l ZfW S @ { ] 1 Q n[�,t�?����~�n�S�u#SL��n�^��������EC��q�/�y���FE�tpm������e&��oB���z9eY��������P��IK?����̦����w�N��;�;J?����;�/��5���M���rZ��q��]��C�dᖣ��F�nd���}���A5���M�5�.�:��/�_D�?�3����'�c�Z7��}��(OI),ۏi����{�<�w�������DZ?e����'q���eY]=���kj���߬������\qhrRn���l�o-��.���k��_���oD8��GA�P�r��|$��ȈPv~Y�:�[q?�sH�� <��C��ˬ�^N�[ v(��S��l�c�C����3���E5&5�VӪL�T��۔���oQrĈ��/���#[f�5�5"� [ t vm \ .0 nh aڌWYM ^T | \, 퓜 L u B ̌ C r % { ) ); fV ] g, > C c2 P4}^h P(%J 「} &:噢\ 5Il氪{/] LBl2I」= Y | >֏N}〜[」 ø :/)Wz3lo.5k & H [jibWWy} 5֝Q | F] KjH5} yNmgӷ ǣ > 'o 泏 < G g > - > xQM %< | u . 3 [[R] 4 E6 [] 1 * 8} NWݽ | | | | | | | | | | | | | OR 9)〜G〜߻W6 {WDO/ZuUS݄LI^> [U1o_J} @@ü// I7 |?CZT(2B〜Ç Wc5'EeFĿꇙ0Ť{W2/OYJjK />:」 _l

除了來自該目錄的網址,即「excite.com/education」,所有網址都提供了正確的源代碼,但這些網址產生了問題。

請人幫助。

在此先感謝。

+0

它可能是一個enconding問題。 –

+1

也許內容被壓縮? – david

+2

內容編碼是gzip內容編碼:gzip – Shashi

回答

4

它適合我。

private static String getWebPabeSource(String sURL) throws IOException { 
     URL url = new URL(sURL); 
     URLConnection urlCon = url.openConnection(); 
     BufferedReader in = null; 

     if (urlCon.getHeaderField("Content-Encoding") != null 
       && urlCon.getHeaderField("Content-Encoding").equals("gzip")) { 
      in = new BufferedReader(new InputStreamReader(new GZIPInputStream(
        urlCon.getInputStream()))); 
     } else { 
      in = new BufferedReader(new InputStreamReader(
        urlCon.getInputStream())); 
     } 

     String inputLine; 
     StringBuilder sb = new StringBuilder(); 

     while ((inputLine = in.readLine()) != null) 
      sb.append(inputLine); 
     in.close(); 

     return sb.toString(); 
} 
2

要讀這樣說:

private static String getUrlSource(String url) throws IOException { 
     URL url = new URL(url); 
     URLConnection urlConn = url.openConnection(); 
     BufferedReader in = new BufferedReader(new InputStreamReader(
       urlConn.getInputStream(), "UTF-8")); 
     String inputLine; 
     StringBuilder a = new StringBuilder(); 
     while ((inputLine = in.readLine()) != null) 
      a.append(inputLine); 
     in.close(); 

     return a.toString(); 
    } 

,並根據網頁設置你的編碼 - 注意這條線:

BufferedReader in = new BufferedReader(new InputStreamReader(
       urlConn.getInputStream(), "UTF-8")); 
0

首先,你必須解壓縮使用GZIPInputStream內容。然後把未壓縮的流輸入流並使用的BufferedReader

使用Apache HTTP客戶端4.1.1

Maven的依賴

<dependency> 
    <groupId>org.apache.httpcomponents</groupId> 
    <artifactId>httpclient</artifactId> 
    <version>4.1.1</version> 
</dependency> 

示例代碼解析gzip的內容閱讀。

package com.gzip.simple; 

import java.io.BufferedReader; 
import java.io.IOException; 
import java.io.InputStream; 
import java.io.InputStreamReader; 
import java.util.zip.GZIPInputStream; 

import org.apache.http.Header; 
import org.apache.http.HttpResponse; 
import org.apache.http.client.ClientProtocolException; 
import org.apache.http.client.methods.HttpGet; 
import org.apache.http.impl.client.DefaultHttpClient; 

public class GZIPFetcher { 
    public static void main(String[] args) { 
     try { 

      DefaultHttpClient httpClient = new DefaultHttpClient(); 
      HttpGet getRequest = new HttpGet("http://excite.com/education"); 
      getRequest.addHeader("accept", "application/json"); 

      HttpResponse response = httpClient.execute(getRequest); 

      if (response.getStatusLine().getStatusCode() != 200) { 
       throw new RuntimeException("Failed : HTTP error code : " 
         + response.getStatusLine().getStatusCode()); 
      } 

      InputStream instream = response.getEntity().getContent(); 

      // Check whether the content-encoding is gzip or not. 
      Header contentEncoding = response 
        .getFirstHeader("Content-Encoding"); 

      if (contentEncoding != null 
        && contentEncoding.getValue().equalsIgnoreCase("gzip")) { 
       instream = new GZIPInputStream(instream); 
      } 

      BufferedReader in = new BufferedReader(new InputStreamReader(
        instream)); 

      String content; 
      System.out.println("Output from Server .... \n"); 
      while ((content = in.readLine()) != null) 
       System.out.println(content); 

      httpClient.getConnectionManager().shutdown(); 

     } catch (ClientProtocolException e) { 

      e.printStackTrace(); 

     } catch (IOException e) { 

      e.printStackTrace(); 
     } 

    } 
}