2017-03-18 106 views
1

我有一個(非常醜陋的)方法從網站獲取頁面以及頁面上的所有圖像。獲取網頁完全沒有問題。但是當我獲取這些圖像時,它們會顯得非常奇怪,並且不會像它們發送的那樣。 我一直在用於測試的uri是這樣的:http://www.themountaingoats.net/contact.html這個網頁非常簡單,並且擁有我需要測試的所有東西。如何通過相同的套接字連接發送圖像和文本

使用\ r或\ n作爲結束字符會給出不同的結果,而\ r \ n將無法打開圖像。

public static String GET(String uri, int port) throws IOException { 

     String domain = uri.split("/",2)[0]; 
     String filename = uri.split("/",2)[1]; 
     Socket socket = new Socket(domain, port); 


     // send the command to the server. 
     System.out.println(socket.isConnected()); 
     DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream()); 
     BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream())); 
     String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n"; 
     System.out.println(request); 
     outToServer.writeBytes(request); 

     //create a file to write in. 
     File file = new File(domain+".txt"); 
     // if file doesnt exists, then create it 
     if (!file.exists()) { 
      file.createNewFile(); 
     } 
     PrintWriter writer = new PrintWriter(file); 
     writer.print(""); 
     writer.close(); 

     int characterCounter=100; 
     while(characterCounter >= 0){ 
      String serverSentence = inFromServer.readLine(); 
      System.out.println(serverSentence); 
      if (serverSentence.startsWith("Content-Length:")){ 
       characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ","")); 
      } 
      if (!serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ") 
        && !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ") 
        && !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ")){ 
       characterCounter = characterCounter - serverSentence.length()-1; 
      } 

      //write in the file 
      FileWriter fw = new FileWriter(file.getAbsoluteFile(),true); 
      BufferedWriter bw = new BufferedWriter(fw); 
      bw.write(serverSentence+"\r\n"); 
      bw.close(); 
     } 


     Document doc = Jsoup.parse(file, "UTF-8"); 
     Elements imgs = doc.getElementsByTag("img"); 

     System.out.println(imgs); 


     for (Element link : imgs) { 
      String source = link.attr("src"); 

      source = source.replace("http://"+domain+"", ""); 

      System.out.println(source); 


      //create a file to write in. 
      File image = new File(source.replace("/", ".")); 
      // if file doesnt exists, then create it 
      if (!image.exists()) { 
       image.createNewFile(); 
      } 

      PrintWriter imageWriter = new PrintWriter(image); 
      imageWriter.print(""); 
      imageWriter.close(); 

      String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n"; 
      System.out.println(requestImage); 
      outToServer.writeBytes(requestImage); 

      boolean flag = false; 
      String previousServerSentence = "something not empty"; 
      characterCounter=100; 
      while(characterCounter > 0){ 
       String serverSentence = inFromServer.readLine(); 
       System.out.println(serverSentence); 
       if (serverSentence.startsWith("Content-Length:")){ 
        characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ","")); 
       } 

       if (!flag){ 
        if (previousServerSentence.matches("") && !serverSentence.matches("")){ 
         flag = true; 
        } 
       } 

       if ((!serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ") 
         && !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ") 
         && !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") && !serverSentence.startsWith("ETag: ") && !serverSentence.startsWith("Accept-Ranges: ") 
         && !serverSentence.startsWith("Accept-Language: ") && !serverSentence.startsWith("Accept-Datetime: ") && !serverSentence.startsWith("Authorization: ") 
         && !serverSentence.startsWith("Connection: ") && !serverSentence.startsWith("Content-Language: ") && !serverSentence.startsWith("Content-Length: ") 
         && !serverSentence.startsWith("Content-Location: ") && !serverSentence.startsWith("Content-MD5: ") && !serverSentence.startsWith("Content-Range: ") 
         && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("expect: ") 
         && !serverSentence.startsWith("From: ") && !serverSentence.startsWith("Host: ") && !serverSentence.startsWith("If-Match: ") && !serverSentence.startsWith("If-Modified-Since: ") 
         && !serverSentence.startsWith("Accept: ") && !serverSentence.startsWith("Accept-Charset: ") && !serverSentence.startsWith("Accept-Encoding: ") 
         && !serverSentence.startsWith("Age: ") && !serverSentence.startsWith("Allow: ") && !serverSentence.startsWith("Content-Encoding: ") 
         && !serverSentence.startsWith("If-None-Match: ") && !serverSentence.startsWith("If-Range: ") && !serverSentence.startsWith("If-Unmodified-Since: ") 
         && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Location: ") && !serverSentence.startsWith("Max-Forwards: ") 
         && !serverSentence.startsWith("Pragma: ") && !serverSentence.startsWith("Proxy-Authenticate: ") && !serverSentence.startsWith("Proxy-Authorization: ") 
         && !serverSentence.startsWith("Range: ") && !serverSentence.startsWith("Referer: ") && !serverSentence.startsWith("Retry-After: ") 
         && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("TE: ") && !serverSentence.startsWith("Trailer: ") 
         && !serverSentence.startsWith("Transfer-Encoding: ") && !serverSentence.startsWith("Upgrade: ") && !serverSentence.startsWith("User-Agent: ") 
         && !serverSentence.startsWith("Via: ") && !serverSentence.startsWith("Warning: ") && !serverSentence.startsWith("WWW-Authenticate: ")) 
         && flag){ 
        characterCounter = characterCounter - serverSentence.length()-1; 
        //write in the file 

        FileWriter fw = new FileWriter(image.getAbsoluteFile(),true); 
        BufferedWriter bw = new BufferedWriter(fw); 
        bw.write(serverSentence+"\r"); 
        bw.close(); 


       } 

       previousServerSentence = serverSentence; 
      } 


     } 
     return null; 
    } 

enter image description here enter image description here

enter image description here

第一圖像是用於\ R作爲底線,所述第二圖像是用於\ n的底線和最後一個圖像是原來的一個。我完全不知道爲什麼圖像變得糟糕。

所以我的問題是:爲什麼會發生這種情況,我該如何解決它?

編輯:

public static String GET(String uri, int port) throws IOException { 

     /* 
     * Retrieval of the webpage 
     */ 

     String domain = uri.split("/",2)[0]; 
     String filename = uri.split("/",2)[1]; 
     Socket socket = new Socket(domain, port); 


     // send the command to the server. 
     System.out.println(socket.isConnected()); 
     DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream()); 
     BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream())); 
     String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n"; 
     System.out.println(request); 
     outToServer.writeBytes(request); 

     //create a file to write in. 
     File file = new File(domain+".txt"); 
     // if file doesnt exists, then create it 
     if (!file.exists()) { 
      file.createNewFile(); 
     } 
     PrintWriter writer = new PrintWriter(file); 
     writer.print(""); 
     writer.close(); 

     int characterCounter=100; 
     while(characterCounter >= 0){ 
      String serverSentence = inFromServer.readLine(); 
      System.out.println(serverSentence); 
      if (serverSentence.startsWith("Content-Length:")){ 
       characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ","")); 
      } 
      if (!serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ") 
        && !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ") 
        && !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ")){ 
       characterCounter = characterCounter - serverSentence.length()-1; 
      } 

      //write in the file 
      FileWriter fw = new FileWriter(file.getAbsoluteFile(),true); 
      BufferedWriter bw = new BufferedWriter(fw); 
      bw.write(serverSentence+"\r\n"); 
      bw.close(); 
     } 

     /* 
     * Retrieval of all the embedded images on the webpage that are on the same domain. 
     */ 

     Document doc = Jsoup.parse(file, "UTF-8"); 
     Elements imgs = doc.getElementsByTag("img"); 

     System.out.println(imgs); 



     for (Element link : imgs) { 
      String source = link.attr("src"); 

      source = source.replace("http://"+domain+"", ""); 

      System.out.println(source); 

      //create a file to write in. 
      File image = new File(source.replace("/", ".")); 
      // if file doesnt exists, then create it 
      if (!image.exists()) { 
       image.createNewFile(); 
      } 

      // Initialize the streams. 
      final FileOutputStream fileOutputStream = new FileOutputStream(image); 
      final InputStream inputStream = socket.getInputStream(); 

      // Header end flag. 
      boolean headerEnded = false; 

      String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n"; 
      System.out.println(requestImage); 
      outToServer.writeBytes(requestImage); 

      int buffersize = 1000000; 
      byte[] bytes = new byte[buffersize]; 
      int length; 

      while ((length = inputStream.read(bytes)) != -1) { 
       // If the end of the header had already been reached, write the bytes to the file as normal. 
       if (headerEnded){ 

        fileOutputStream.write(bytes, 0, length); 
       } 
       // This locates the end of the header by comparing the current byte as well as the next 3 bytes 
       // with the HTTP header end "\r\n\r\n" (which in integer representation would be 13 10 13 10). 
       // If the end of the header is reached, the flag is set to true and the remaining data in the 
       // currently buffered byte array is written into the file. 
       else { 
        for (int i = 0; i < buffersize-3; i++) { 
         if (bytes[i] == 13 && bytes[i + 1] == 10 && bytes[i + 2] == 13 && bytes[i + 3] == 10) { 
          headerEnded = true; 
          fileOutputStream.write(bytes, i+4 , buffersize-i-4); 
          break; 
         } 
        } 
       } 
      } 

      inputStream.close(); 
      fileOutputStream.close(); 

     } 
     socket.close(); 
     return null; 
    } 

這是我現在的結果是:圖像

​​

我可以得到一部分,但不是整個畫面。使用緩衝區大小可以讓我稍微遠一點,甚至更遠一點。

EDIT2:我發現了錯誤。它只是與一些維度有關。 最後的工作代碼:

public static String GET(String uri, int port) throws IOException { 

    /* 
    * Retrieval of the webpage 
    */ 

    String domain = uri.split("/",2)[0]; 
    String filename = uri.split("/",2)[1]; 
    Socket socket = new Socket(domain, port); 


    // send the command to the server. 
    System.out.println(socket.isConnected()); 
    DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream()); 
    BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream())); 
    String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n"; 
    System.out.println(request); 
    outToServer.writeBytes(request); 

    //create a file to write in. 
    File file = new File(domain+".txt"); 
    // if file doesnt exists, then create it 
    if (!file.exists()) { 
     file.createNewFile(); 
    } 
    PrintWriter writer = new PrintWriter(file); 
    writer.print(""); 
    writer.close(); 

    int characterCounter=100; 
    while(characterCounter >= 0){ 
     String serverSentence = inFromServer.readLine(); 
     System.out.println(serverSentence); 
     if (serverSentence.startsWith("Content-Length:")){ 
      characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ","")); 
     } 
     if (!serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ") 
       && !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ") 
       && !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ")){ 
      characterCounter = characterCounter - serverSentence.length()-1; 
     } 

     //write in the file 
     FileWriter fw = new FileWriter(file.getAbsoluteFile(),true); 
     BufferedWriter bw = new BufferedWriter(fw); 
     bw.write(serverSentence+"\r\n"); 
     bw.close(); 
    } 

    /* 
    * Retrieval of all the embedded images on the webpage that are on the same domain. 
    */ 

    Document doc = Jsoup.parse(file, "UTF-8"); 
    Elements imgs = doc.getElementsByTag("img"); 

    System.out.println(imgs); 


    for (Element link : imgs) { 

     // Getting the link ready for GET query. 

     String source = link.attr("src"); 

     source = source.replace("http://"+domain+"", ""); 

     System.out.println(source); 

     //create a file to write in. 
     File image = new File(source.replace("/", ".")); 
     // if file doesnt exists, then create it 
     if (!image.exists()) { 
      image.createNewFile(); 
     } 

     String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n"; 
     System.out.println(requestImage); 
     outToServer.writeBytes(requestImage); 

     // Initialize the streams. 
     final FileOutputStream fileOutputStream = new FileOutputStream(image); 
     final InputStream inputStream = socket.getInputStream(); 

     // Header end flag. 
     boolean headerEnded = false; 

     int buffersize = 10000; 
     byte[] bytes = new byte[buffersize]; 
     int length; 
     while ((length = inputStream.read(bytes)) != -1) { 
      // If the end of the header had already been reached, write the bytes to the file as normal. 
      if (headerEnded){ 
       fileOutputStream.write(bytes, 0, length); 
      } 
      // This locates the end of the header by comparing the current byte as well as the next 3 bytes 
      // with the HTTP header end "\r\n\r\n" (which in integer representation would be 13 10 13 10). 
      // If the end of the header is reached, the flag is set to true and the remaining data in the 
      // currently buffered byte array is written into the file. 
      else { 
       for (int i = 0; i < length-3; i++) { 
        if (bytes[i] == 13 && bytes[i + 1] == 10 && bytes[i + 2] == 13 && bytes[i + 3] == 10) { 
         headerEnded = true; 
         fileOutputStream.write(bytes, i+4 , length-i-4); 
         break; 
        } 
       } 
      } 
     } 

     inputStream.close(); 
     fileOutputStream.close(); 

    } 
    socket.close(); 
    return null; 
} 
+0

你有使用插座,與其URLConnection的? – JCasso

+0

這是一個任務,我們必須使用套接字,不能使用任何像UrlConnection檢索數據 – Sander

回答

1

避免使用原始套接字可能在處理HTTP請求。

見4ndrew的答案,如果你可以使用一個單獨的連接以檢索圖像文件: https://stackoverflow.com/a/8679160/176873

如果你被卡住原始套接字,儘量避免使用java.io.BufferedReader中。 BufferedReader不應該被用來讀取二進制數據。您正在將二進制數據轉換爲字符串並將文本文件寫入本地PC。

見Alexay的答案解決方法: https://stackoverflow.com/a/34106534/176873

+0

你的第二個鏈接似乎幾乎適用於我:我現在可以部分下載圖像。不管我設置緩衝區有多大,我只能得到圖像的前幾行。如果我只是將緩衝區大小設置得非常大,則行數量似乎會減少。更改的代碼在編輯我的問題 – Sander

+0

我接受了你的答案,因爲它使我找到了一個解決方案,儘管我仍然不得不思考一點。非常感謝你。 – Sander