2017-04-06 32 views
1

我正在用Java寫一個蜘蛛(爬蟲)程序,在處理URL重定向時遇到了一些麻煩。目前我遇到了兩種URL重定向:第一種是HTTP響應代碼3xx,這種我可以參照this answer來處理。(主題:通過JavaScript重定向抓取)

但第二種是網頁服務器返回HTTP響應代碼200,而頁面內容只包含類似下面這樣的一些JavaScript代碼:

<!-- Example page whose only content is a script that navigates the browser to another URL. --> 
<html> 
<head> 
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> 
<script> 
// Sends the browser to the mobile or the desktop entry page depending on the user-agent string.
function detectmob() { 
    // u captures the current document URL; it is not read again in this snippet.
    var u=(document.URL); 
    // "some other browser..." is a placeholder for further user-agent checks, not runnable JavaScript.
    if(navigator.userAgent.match(/Android/i) || some other browser...){ 
     window.location.href="web/mobile/index.php"; 
    } else { 
     window.location.href="web/desktop/index.php"; 
    } 
} 

// Called right away, so the navigation happens as soon as the script is executed.
detectmob(); 
</script> 
</head> 
<body></body></html> 

如果原始URL是http://example.com,那麼當我使用啓用了JavaScript的桌面Web瀏覽器時,它會自動重定向到http://example.com/web/desktop/index.php。

然而,我的蜘蛛通過檢查HttpURLConnection#getResponseCode()來判斷是否已經得到HTTP response code 200(即已到達最終URL);如果收到的是HTTP response code 3xx,則使用URLConnection#getHeaderField()讀取Location字段,以繼續尋找最終URL。以下是我的蜘蛛的代碼片段:

/**
 * Follows HTTP-level redirects starting at {@code originalUrl} and returns the last URL reached.
 *
 * <p>Automatic redirect following is disabled on the connection so that every hop can be
 * inspected. A response counts as a redirect when its status is 301, 302, 303, 307 or 308 and
 * it carries a non-empty {@code Location} header. Relative {@code Location} values are resolved
 * against the URL that produced them. The walk stops after a fixed number of hops so that a
 * redirect loop cannot run forever.
 *
 * <p>JavaScript-driven redirects inside a 200 response are not detected here.
 *
 * @param originalUrl absolute URL to start from
 * @return the URL of the first non-redirect response, or the URL being processed when an
 *         I/O error occurs, the URL is not an HTTP(S) URL, or the hop limit is reached
 */
public String getFinalUrl(String originalUrl) {
    final int maxRedirects = 20;
    String currentUrl = originalUrl;

    for (int hop = 0; hop < maxRedirects; hop++) {
        HttpURLConnection hCon = null;
        try {
            URL url = new URL(currentUrl);
            URLConnection con = url.openConnection();
            if (!(con instanceof HttpURLConnection)) {
                // e.g. file: or ftp: URLs have no HTTP status code to inspect.
                return currentUrl;
            }
            hCon = (HttpURLConnection) con;
            hCon.setInstanceFollowRedirects(false);

            int status = hCon.getResponseCode();
            boolean redirect = status == HttpURLConnection.HTTP_MOVED_PERM
                    || status == HttpURLConnection.HTTP_MOVED_TEMP
                    || status == HttpURLConnection.HTTP_SEE_OTHER
                    || status == 307   // Temporary Redirect
                    || status == 308;  // Permanent Redirect
            String location = hCon.getHeaderField("Location");
            if (!redirect || location == null || location.trim().isEmpty()) {
                return currentUrl;
            }

            // Location may be relative ("/web/index.php"); resolve it against the current URL.
            currentUrl = new URL(url, location.trim()).toString();
            System.out.println("redirected url: " + currentUrl);
        } catch (IOException ex) {
            System.err.println(ex.toString());
            return currentUrl;
        } finally {
            if (hCon != null) {
                hCon.disconnect();
            }
        }
    }

    return currentUrl;
}

因此,請求上述網頁時會得到HTTP response code 200,我的蜘蛛就會認為不會再有進一步的重定向,並開始解析這個從文本內容上看是空白的頁面。

我用谷歌搜索過這個問題,javax.script顯然與此有些關係,但我不知道如何讓它工作。我應該如何編寫我的蜘蛛,使它能夠獲得正確的URL?

回答

0

這是一個解決方案:它使用Apache HttpClient處理響應代碼重定向,用Jsoup從html中提取javascript,然後使用正則表達式,針對javascript中可以執行重定向的幾種寫法提取重定向目標字符串。

package com.yourpackage; 

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.google.common.base.Joiner;
import com.google.common.net.HttpHeaders;

public class CrawlHelper {

    /** Matches an explicit {@code http://} or {@code https://} prefix, ignoring case. */
    private static final Pattern HTTP_SCHEME =
            Pattern.compile("https?://", Pattern.CASE_INSENSITIVE);

    /**
     * Matches {@code window.location.replace("target")} with single or double quotes.
     * The target is captured reluctantly in group 2 so that two calls on the same line
     * are not merged into one match.
     */
    private static final Pattern LOCATION_REPLACE =
            Pattern.compile("window\\.location\\.replace\\(\\s*([\"'])(.*?)\\1\\s*\\)");

    /**
     * Matches {@code window.location.href = "target"} with single or double quotes.
     * The target is captured reluctantly in group 2.
     */
    private static final Pattern HREF_ASSIGN =
            Pattern.compile("window\\.location\\.href\\s*=\\s*([\"'])(.*?)\\1");

    /** Value sent in the User-Agent request header. */
    private static final String USER_AGENT_VALUE = "Mozilla/5.0 (compatible; CrawlHelper/1.0)";

    /**
     * Fetches a page and, if one of its scripts contains a JavaScript redirect, fetches the
     * redirect target instead.
     *
     * <p>HTTP status redirects are left to the HTTP client. Scripts are extracted with Jsoup
     * and scanned in document order; the first script that yields a redirect target wins.
     * Only one JavaScript hop is followed.
     *
     * @param urlString URL to fetch; {@code http://} is prepended when no http/https scheme
     *                  is present
     * @return body of the redirect target when a JavaScript redirect is found, otherwise the
     *         body of {@code urlString}
     * @throws IOException if the URL is blank or malformed, or the request fails
     */
    public String getResult(final String urlString) throws IOException {
        String html = getTextFromUrl(urlString);
        Document doc = Jsoup.parse(html);
        for (Element script : doc.select("script")) {
            String target = getTargetLocationFromScript(urlString, script.html());
            // A blank target means this script does not redirect; keep looking.
            if (!StringUtil.isBlank(target)) {
                return getTextFromUrl(target);
            }
        }
        return html;
    }

    /**
     * Finds the redirect target in a script and resolves it against the page URL.
     *
     * <p>If {@code urlString} has no scheme, the result has none either
     * (for example {@code somesite.com} + {@code /s/index.html} gives
     * {@code somesite.com/s/index.html}).
     *
     * @param urlString URL of the page the script came from; used as the base for relative
     *                  targets
     * @param js Javascript to scan
     * @return the resolved target, the target itself when it already starts with
     *         http:// or https://, or an empty string when the script contains no redirect
     *         or the target cannot be resolved
     * @throws IOException declared for compatibility with existing callers
     */
    String getTargetLocationFromScript(String urlString, String js) throws IOException {
        String target = getTargetLocationFromScript(js).trim();
        if (target.isEmpty()) {
            return "";
        }
        if (HTTP_SCHEME.matcher(target).lookingAt() || StringUtil.isBlank(urlString)) {
            return target;
        }

        String base = urlString.trim();
        // URL resolution needs a scheme; add one temporarily and strip it again afterwards.
        boolean schemeAdded = !HTTP_SCHEME.matcher(base).lookingAt();
        if (schemeAdded) {
            base = "http://" + base;
        }
        try {
            String resolved = new URL(new URL(base), target).toString();
            if (schemeAdded && resolved.startsWith("http://")) {
                return resolved.substring("http://".length());
            }
            return resolved;
        } catch (MalformedURLException mue) {
            // e.g. "javascript:void(0)" - not something that can be fetched, so not a redirect.
            return "";
        }
    }

    /**
     * Extracts the first non-blank redirect target from a script.
     *
     * <p>{@code window.location.replace(...)} calls are checked first, then
     * {@code window.location.href = ...} assignments.
     *
     * @param js Javascript to scan; may be {@code null}
     * @return the raw target exactly as written in the script, or an empty string
     * @throws IOException declared for compatibility with existing callers
     */
    String getTargetLocationFromScript(String js) throws IOException {
        if (js == null) {
            return "";
        }
        String target = findTargetFrom(LOCATION_REPLACE, js);
        if (StringUtil.isBlank(target)) {
            target = findTargetFrom(HREF_ASSIGN, js);
        }
        return target;
    }

    /**
     * Returns the first non-blank quoted target (group 2) matched by {@code pattern}.
     *
     * @param pattern one of the redirect patterns declared on this class
     * @param js Javascript to scan
     * @return the captured target, or an empty string when nothing matches
     */
    private String findTargetFrom(Pattern pattern, String js) {
        Matcher m = pattern.matcher(js);
        while (m.find()) {
            String candidate = m.group(2);
            if (!StringUtil.isBlank(candidate)) {
                return candidate;
            }
        }
        return "";
    }

    /**
     * Downloads the body of a URL as text.
     *
     * <p>The body is decoded as UTF-8 and line breaks are kept, so that single-line
     * {@code //} comments in inline scripts do not swallow the code after them.
     * The client and the response are closed before returning.
     *
     * @param urlString URL to fetch; a scheme is added when missing
     * @return the response body, or an empty string when the response has no entity
     * @throws IOException if the URL is blank or malformed, or the request fails
     */
    private String getTextFromUrl(String urlString) throws IOException {
        if (StringUtil.isBlank(urlString)) {
            throw new IOException("Supplied URL value is empty.");
        }
        HttpGet request = new HttpGet(prependHTTPifNecessary(urlString));
        // HttpHeaders.USER_AGENT is the header *name*; the value is a separate constant.
        request.addHeader(HttpHeaders.USER_AGENT, USER_AGENT_VALUE);

        try (CloseableHttpClient client = HttpClientBuilder.create().build();
             CloseableHttpResponse response = client.execute(request)) {
            if (response.getEntity() == null) {
                return "";
            }
            try (BufferedReader rd = new BufferedReader(new InputStreamReader(
                    response.getEntity().getContent(), StandardCharsets.UTF_8))) {
                StringBuilder result = new StringBuilder();
                String line;
                while ((line = rd.readLine()) != null) {
                    result.append(line).append('\n');
                }
                return result.toString();
            }
        }
    }

    /**
     * Ensures the URL carries an http/https scheme and is well-formed.
     *
     * @param urlString URL with or without a scheme
     * @return the trimmed URL, prefixed with {@code http://} when it had no http/https scheme
     * @throws IOException if the resulting URL is malformed
     */
    private String prependHTTPifNecessary(String urlString) throws IOException {
        String candidate = urlString.trim();
        if (!HTTP_SCHEME.matcher(candidate).lookingAt()) {
            candidate = "http://" + candidate;
        }
        return validateURL(candidate);
    }

    /**
     * Checks that a string parses as a URL.
     *
     * @param urlString URL to check
     * @return {@code urlString} unchanged
     * @throws IOException wrapping the {@link MalformedURLException} when parsing fails
     */
    private String validateURL(String urlString) throws IOException {
        try {
            new URL(urlString);
        } catch (MalformedURLException mue) {
            throw new IOException(mue);
        }
        return urlString;
    }
}

TDD……請根據不同的場景修改/增強以下測試:

package com.yourpackage; 

import java.io.IOException; 

import org.junit.Assert; 
import org.junit.Test; 

public class CrawlHelperTest {

    /**
     * Checks redirect-target extraction for both supported script forms:
     * a {@code window.location.replace(...)} call resolved against a base URL, and a bare
     * {@code window.location.href} assignment returned as written.
     */
    @Test
    public void testRegex() throws IOException {
        final CrawlHelper helper = new CrawlHelper();

        final String replaceScript = "function goHome() { window.location.replace(\"/s/index.html\")}";
        final String resolvedTarget = helper.getTargetLocationFromScript("somesite.com", replaceScript);
        Assert.assertEquals("somesite.com/s/index.html", resolvedTarget);

        final String hrefScript = "window.location.href=\"web/mobile/index.php\";";
        final String rawTarget = helper.getTargetLocationFromScript(hrefScript);
        Assert.assertEquals("web/mobile/index.php", rawTarget);
    }

    /**
     * Fetches a page end to end and checks that the expected marker text is present.
     * The host and the marker are placeholders to be replaced with real values.
     */
    @Test
    public void testCrawl() throws IOException {
        final String body = new CrawlHelper().getResult("somesite.com");
        final int markerIndex = body.indexOf("someExpectedContent");
        Assert.assertTrue(markerIndex > -1);
    }

}