2013-10-31 46 views
0

我有一個 GWT 應用程序,需要爲 SEO 進行優化(讓 Google 能抓取內容)。我一直在嘗試很多解決方案,但這些方案都不能在滿足我們需求的時間內返回 HTML 頁面。已經試驗過的方案如下(問題核心是:爲 gwt 應用程序生成爬蟲快照需要太多時間):

  1. 我想可以使用HtmlUnit作爲無頭瀏覽器抓取需要的頁面,只需大約15秒鐘獲得html內容(審計這個時機時,結果是該時序的80%被等待背景JavaScript的循環佔用「while (waitForBackgroundJavaScript > 0 && loopCount < _maxLoopChecks)」)
  2. 一種技術,包括在谷歌請求之前抓取頁面,然後在谷歌要求時提供已保存的快照(但這個解決方案絕對不方便,因爲內容變化非常頻繁,谷歌可能認爲這是「CLOAKING(僞裝)」)

有什麼建議嗎?

用來抓取代碼:

/**
 * Servlet filter that serves static HTML snapshots of a GWT (AJAX) application
 * to web crawlers. When a request's query string carries the crawler token
 * {@code _escaped_fragment_} (Google's AJAX-crawling scheme), the filter
 * renders the corresponding {@code #!} page with HtmlUnit and writes the
 * resulting HTML back; all other requests pass through untouched.
 */
public class CrawlFilter implements Filter {
    /**
     * AJAX controller that forces every AJAX call to execute synchronously,
     * so the page DOM is complete before the snapshot is taken.
     * Declared {@code static}: it needs no enclosing-instance reference, and
     * as a {@code Serializable} type it must not capture one.
     */
    private static class SyncAllAjaxController extends NicelyResynchronizingAjaxController {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
            // Treat every request — asynchronous or not — as synchronous.
            return true;
        }
    }

    private final Logger log = Logger.getLogger(CrawlFilter.class.getName());

    /**
     * Special URL token that gets passed from the crawler to the servlet
     * filter. This form is matched when {@code _escaped_fragment_} is the
     * first (or only) query parameter.
     */
    private static final String ESCAPED_FRAGMENT_FORMAT1 = "_escaped_fragment_=";
    private static final int ESCAPED_FRAGMENT_LENGTH1 = ESCAPED_FRAGMENT_FORMAT1.length();
    /**
     * Same token preceded by {@code '&'}. This form is matched when other
     * query parameters precede {@code _escaped_fragment_}.
     */
    private static final String ESCAPED_FRAGMENT_FORMAT2 = "&" + ESCAPED_FRAGMENT_FORMAT1;
    private static final int ESCAPED_FRAGMENT_LENGTH2 = ESCAPED_FRAGMENT_FORMAT2.length();

    /** Upper bound for pumping the HtmlUnit JavaScript event loop. */
    private static final long PUMP_EVENT_LOOP_TIMEOUT_MILLIS = 30000;
    /** Per-call timeout handed to {@code waitForBackgroundJavaScript}. */
    private static final long JS_TIMEOUT_MILLIS = 1000;
    /** Pause between background-JavaScript polls. */
    private static final long PAGE_WAIT_MILLIS = 200;
    /** Maximum number of background-JavaScript polling iterations. */
    private static final int MAX_LOOP_CHECKS = 2;

    /**
     * Intercepts crawler requests (identified by {@code _escaped_fragment_} in
     * the query string), renders the page with HtmlUnit and writes the static
     * HTML snapshot; every other request is passed down the filter chain.
     */
    @Override
    public void doFilter(ServletRequest request, ServletResponse response,
            FilterChain filterChain) throws IOException, ServletException {
        // Grab the request uri and query strings.
        final HttpServletRequest httpRequest = (HttpServletRequest) request;
        final String requestURI = httpRequest.getRequestURI();
        final String queryString = httpRequest.getQueryString();
        final HttpServletResponse httpResponse = (HttpServletResponse) response;

        if ((queryString != null) && (queryString.contains(ESCAPED_FRAGMENT_FORMAT1))) {
            final int port = httpRequest.getServerPort();
            final String urlStringWithHashFragment = requestURI + rewriteQueryString(queryString);
            final String scheme = httpRequest.getScheme();
            // Loop back to this very server: HtmlUnit fetches the interactive
            // page locally and executes its JavaScript to build the snapshot.
            final URL urlWithHashFragment = new URL(scheme, "127.0.0.1", port, urlStringWithHashFragment);
            final WebRequest webRequest = new WebRequest(urlWithHashFragment);

            log.fine("Crawl filter encountered escaped fragment, will open: " + webRequest.toString());

            httpResponse.setContentType("text/html;charset=UTF-8");
            final PrintWriter out = httpResponse.getWriter();
            out.println(renderPage(webRequest));
            out.flush();
            out.close();

            log.fine("HtmlUnit completed webClient.getPage(webRequest) where webRequest = " + webRequest.toString());
        } else {
            filterChain.doFilter(request, response);
        }
    }

    @Override
    public void destroy() {
        // Each request creates and closes its own WebClient in renderPage(),
        // so there is no shared state to release here.
    }

    @Override
    public void init(FilterConfig config) throws ServletException {
    }

    /**
     * Renders the page addressed by {@code webRequest} in a headless HtmlUnit
     * browser, waits (bounded) for background JavaScript to settle, and
     * returns the static HTML snapshot prefixed with a banner that links back
     * to the interactive application.
     *
     * @param webRequest the local URL (with restored {@code #!} fragment) to render
     * @return the snapshot HTML
     * @throws IOException if HtmlUnit fails to fetch the page
     */
    private StringBuilder renderPage(WebRequest webRequest) throws IOException {
        // One WebClient per request: WebClient is not thread-safe, and the
        // previous shared-field design let concurrent requests clobber each
        // other's client and leaked every client except the last one.
        final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17);
        try {
            webClient.getCache().clear();
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setRedirectEnabled(false);
            webClient.setAjaxController(new SyncAllAjaxController());
            webClient.setCssErrorHandler(new SilentCssErrorHandler());

            final HtmlPage page = webClient.getPage(webRequest);
            webClient.getJavaScriptEngine().pumpEventLoop(PUMP_EVENT_LOOP_TIMEOUT_MILLIS);

            int waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(JS_TIMEOUT_MILLIS);
            int loopCount = 0;

            // Poll until no background JavaScript jobs remain, or give up
            // after MAX_LOOP_CHECKS iterations to bound snapshot latency.
            while (waitForBackgroundJavaScript > 0 && loopCount < MAX_LOOP_CHECKS) {
                ++loopCount;
                waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(JS_TIMEOUT_MILLIS);

                if (waitForBackgroundJavaScript == 0) {
                    log.fine("HtmlUnit exits background javascript at loop counter " + loopCount);
                    break;
                }

                synchronized (page) {
                    log.fine("HtmlUnit waits for background javascript at loop counter " + loopCount);
                    try {
                        page.wait(PAGE_WAIT_MILLIS);
                    } catch (InterruptedException e) {
                        log.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount, e);
                        // Preserve the interrupt status for callers up-stack.
                        Thread.currentThread().interrupt();
                    }
                }
            }

            webClient.getAjaxController().processSynchron(page, webRequest, false);
            if (webClient.getJavaScriptEngine().isScriptRunning()) {
                log.warning("HtmlUnit webClient.getJavaScriptEngine().shutdownJavaScriptExecutor()");
                webClient.getJavaScriptEngine().shutdownJavaScriptExecutor();
            }

            final String staticSnapshotHtml = page.asXml();
            StringBuilder stringBuilder = new StringBuilder();
            stringBuilder.append("<hr />\n");
            stringBuilder.append("<center><h3>This is a non-interactive snapshot for crawlers. Follow <a href=\"");
            stringBuilder.append(webRequest.getUrl());
            stringBuilder.append("\">this link</a> for the interactive application.<br></h3></center>");
            stringBuilder.append("<hr />");
            stringBuilder.append(staticSnapshotHtml);

            return stringBuilder;
        } finally {
            // Always release browser windows/threads, even on exceptions.
            webClient.closeAllWindows();
        }
    }

    /**
     * Maps from the query string that contains _escaped_fragment_ to one that
     * doesn't, but is instead followed by a hash fragment. It also unescapes any
     * characters that were escaped by the crawler. If the query string does not
     * contain _escaped_fragment_, it is not modified.
     *
     * @param queryString the raw query string (never {@code null} here;
     *                    callers check before invoking)
     * @return A modified query string followed by a hash fragment if applicable.
     *         The non-modified query string otherwise.
     * @throws UnsupportedEncodingException if UTF-8 is unsupported (never on a
     *                                      compliant JVM)
     */
    private static String rewriteQueryString(String queryString) throws UnsupportedEncodingException {
        // Prefer the "&_escaped_fragment_=" form so the '&' separator is
        // consumed along with the token.
        int index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT2);
        int length = ESCAPED_FRAGMENT_LENGTH2;

        if (index == -1) {
            index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT1);
            length = ESCAPED_FRAGMENT_LENGTH1;
        }

        if (index != -1) {
            StringBuilder queryStringSb = new StringBuilder();
            if (index > 0) {
                // Keep the ordinary parameters that preceded the token.
                queryStringSb.append("?");
                queryStringSb.append(queryString.substring(0, index));
            }
            queryStringSb.append("#!");
            queryStringSb.append(URLDecoder.decode(queryString.substring(index + length), "UTF-8"));
            return queryStringSb.toString();
        }

        return queryString;
    }
}
+0

您的內容多久改變一次?我使用上述兩種技術的組合,對我來說效果很好。您可以讓低優先級後臺線程每天多次重新生成快照。我不明白爲什麼它需要15秒,在我的情況下,它只需要少於5秒。也許你可以發佈你的代碼。 –

+4

這個問題似乎是脫離主題,因爲它是關於SEO –

+0

@JohnConde它與SEO有關,但它是關於實現基於Ajax的Webapps的快照生成,這實際上是關於編程imo。 –

回答

0
我建議離線(後臺)用 HtmlUnit 預先生成靜態 HTML 快照。

這樣你可以自己控制更新頻率。

然後,讓你的servlet過濾攔截爬蟲請求返回已經生成的靜態html。