
Hi, I want to create a web crawler in Java. I want to retrieve some data from web pages, such as the title and description, and store that data in a database. How do I create a web crawler in Java?


I like HtmlUnit, but I don't know how well it works on Android... – MatrixFrog 2010-11-09 06:54:58


Tell me how to create a web crawler using HtmlUnit. First, I want to parse some data and store it in a database. – 2010-11-09 07:16:43
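
(For reference, fetching a page and listing its links with HtmlUnit looks roughly like the sketch below. This is untested illustration, not the answerer's code; it assumes a recent HtmlUnit 2.x, and the URL is a placeholder:)

import java.util.List; 

import com.gargoylesoftware.htmlunit.WebClient; 
import com.gargoylesoftware.htmlunit.html.HtmlAnchor; 
import com.gargoylesoftware.htmlunit.html.HtmlPage; 

public class HtmlUnitFetch { 
    public static void main(String[] args) throws Exception { 
        // try-with-resources closes the client; requires a recent HtmlUnit 2.x 
        try (WebClient client = new WebClient()) { 
            client.getOptions().setJavaScriptEnabled(false); // plain HTML is enough here 

            HtmlPage page = client.getPage("http://news.yahoo.com/"); 
            System.out.println("Title: " + page.getTitleText()); 

            // List every link on the page. 
            List<HtmlAnchor> anchors = page.getAnchors(); 
            for (HtmlAnchor anchor : anchors) { 
                System.out.println(anchor.getHrefAttribute()); 
            } 
        } 
    } 
} 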

Answers


If you want to do it yourself, use the HttpClient that ships with the Android API.

Here's an example of HttpClient usage (you only need to parse out the links you care about):

import java.io.IOException; 
import java.io.InputStreamReader; 
import java.io.Reader; 
import java.util.HashSet; 
import java.util.LinkedList; 
import java.util.List; 
import java.util.Set; 
import javax.swing.text.MutableAttributeSet; 
import javax.swing.text.html.HTML; 
import javax.swing.text.html.HTMLEditorKit; 
import javax.swing.text.html.parser.ParserDelegator; 
import org.apache.http.HttpResponse; 
import org.apache.http.client.ClientProtocolException; 
import org.apache.http.client.HttpClient; 
import org.apache.http.client.methods.HttpGet; 
import org.apache.http.impl.client.DefaultHttpClient; 

public class HttpTest { 

    /* URLs that have already been crawled. */ 
    static Set<String> checked = new HashSet<String>(); 

    public static void main(String... args) 
            throws ClientProtocolException, IOException { 
        crawlPage("http://www.google.com/"); 
    } 

    private static void crawlPage(String url) 
            throws ClientProtocolException, IOException { 

        if (checked.contains(url)) 
            return; 
        checked.add(url); 

        System.out.println("Crawling: " + url); 

        HttpClient client = new DefaultHttpClient(); 
        HttpGet request = new HttpGet(url); // fetch the requested page, not a hard-coded one 
        HttpResponse response = client.execute(request); 

        Reader reader = null; 
        try { 
            reader = new InputStreamReader(response.getEntity().getContent()); 

            // Collect every link on the page, then crawl each one recursively. 
            Links links = new Links(); 
            new ParserDelegator().parse(reader, links, true); 

            for (String link : links.list) 
                if (link.startsWith("http://")) 
                    crawlPage(link); 

        } finally { 
            if (reader != null) { 
                try { 
                    reader.close(); 
                } catch (IOException e) { 
                    e.printStackTrace(); 
                } 
            } 
        } 
    } 

    /* Parser callback that records the href of every <a> tag. */ 
    static class Links extends HTMLEditorKit.ParserCallback { 

        List<String> list = new LinkedList<String>(); 

        @Override 
        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { 
            if (t == HTML.Tag.A) { 
                Object href = a.getAttribute(HTML.Attribute.HREF); 
                if (href != null) // skip anchors without an href 
                    list.add(href.toString()); 
            } 
        } 
    } 
} 
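
Note that ParserDelegator and HTMLEditorKit belong to Swing, which is not available on Android, so the parsing half of this example will not run there. A common alternative is to extract the links with jsoup instead; here is a minimal sketch of that approach (illustration only, with the seed URL as a placeholder):

import java.io.IOException; 

import org.jsoup.Jsoup; 
import org.jsoup.nodes.Document; 
import org.jsoup.nodes.Element; 

public class JsoupLinkExtractor { 
    public static void main(String[] args) throws IOException { 
        // Fetch and parse the page in one step. 
        Document doc = Jsoup.connect("http://www.google.com/").get(); 

        // "a[href]" selects only anchors that actually have an href. 
        for (Element link : doc.select("a[href]")) { 
            // absUrl resolves relative links against the page URL. 
            System.out.println(link.absUrl("href")); 
        } 
    } 
} 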

You can use crawler4j. Crawler4j is an open-source Java crawler that provides a simple interface for crawling the web, and it lets you set up a multi-threaded web crawler in a few hours. A sketch of what a minimal crawler4j crawler looks like follows.
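
(This is a hedged sketch based on the crawler4j 4.x API, which differs from older releases; the storage folder, seed URL, and domain filter are placeholders:)

import edu.uci.ics.crawler4j.crawler.CrawlConfig; 
import edu.uci.ics.crawler4j.crawler.CrawlController; 
import edu.uci.ics.crawler4j.crawler.Page; 
import edu.uci.ics.crawler4j.crawler.WebCrawler; 
import edu.uci.ics.crawler4j.fetcher.PageFetcher; 
import edu.uci.ics.crawler4j.parser.HtmlParseData; 
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig; 
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer; 
import edu.uci.ics.crawler4j.url.WebURL; 

public class MyCrawler extends WebCrawler { 

    // Only follow links that stay on the seed site (placeholder domain). 
    @Override 
    public boolean shouldVisit(Page referringPage, WebURL url) { 
        return url.getURL().startsWith("http://www.example.com/"); 
    } 

    // Called for every fetched page; print the URL and title. 
    @Override 
    public void visit(Page page) { 
        if (page.getParseData() instanceof HtmlParseData) { 
            HtmlParseData html = (HtmlParseData) page.getParseData(); 
            System.out.println(page.getWebURL().getURL() + " -> " + html.getTitle()); 
        } 
    } 

    public static void main(String[] args) throws Exception { 
        CrawlConfig config = new CrawlConfig(); 
        config.setCrawlStorageFolder("crawl"); // intermediate crawl data goes here 

        PageFetcher fetcher = new PageFetcher(config); 
        RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher); 
        CrawlController controller = new CrawlController(config, fetcher, robots); 

        controller.addSeed("http://www.example.com/"); 
        controller.start(MyCrawler.class, 4); // 4 crawler threads 
    } 
} 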


You can use WebCollector: https://github.com/CrawlScript/WebCollector

A demo based on WebCollector 2.05:

import cn.edu.hfut.dmic.webcollector.crawler.BreadthCrawler; 
import cn.edu.hfut.dmic.webcollector.model.Links; 
import cn.edu.hfut.dmic.webcollector.model.Page; 
import java.util.regex.Pattern; 
import org.jsoup.nodes.Document; 
import org.jsoup.nodes.Element; 

/** 
* Crawl news from yahoo news 
* 
* @author hu 
*/ 
public class YahooCrawler extends BreadthCrawler { 

    /** 
    * @param crawlPath crawlPath is the path of the directory which maintains 
    * information of this crawler 
    * @param autoParse if autoParse is true, BreadthCrawler will automatically 
    * extract links that match the regex rules from each page 
    */ 
    public YahooCrawler(String crawlPath, boolean autoParse) { 
     super(crawlPath, autoParse); 
     /*start page*/ 
     this.addSeed("http://news.yahoo.com/"); 

     /*fetch urls like http://news.yahoo.com/xxxxx*/ 
     this.addRegex("http://news.yahoo.com/.*"); 
     /*do not fetch urls like http://news.yahoo.com/xxxx/xxx*/ 
     this.addRegex("-http://news.yahoo.com/.+/.*"); 
     /*do not fetch jpg|png|gif*/ 
     this.addRegex("-.*\\.(jpg|png|gif).*"); 
     /*do not fetch urls that contain #*/ 
     this.addRegex("-.*#.*"); 
    } 

    @Override 
    public void visit(Page page, Links nextLinks) { 
     String url = page.getUrl(); 
     /*if page is news page*/ 
     if (Pattern.matches("http://news.yahoo.com/.+html", url)) { 
      /*we use jsoup to parse page*/ 
      Document doc = page.getDoc(); 

      /*extract title and content of news by css selector*/ 
      Element headline = doc.select("h1[class=headline]").first(); 
      Element body = doc.select("div[class=body yom-art-content clearfix]").first(); 
      /*skip the page if the layout does not match, instead of throwing NPE*/ 
      if (headline == null || body == null) { 
       return; 
      } 
      String title = headline.text(); 
      String content = body.text(); 

      System.out.println("URL:\n" + url); 
      System.out.println("title:\n" + title); 
      System.out.println("content:\n" + content); 

      /*If you want to add urls to crawl, add them to nextLinks*/ 
      /*WebCollector automatically filters links that have been fetched before*/ 
      /*If autoParse is true and a link added to nextLinks does not match the regex rules, it will also be filtered.*/ 
      // nextLinks.add("http://xxxxxx.com"); 
     } 
    } 

    public static void main(String[] args) throws Exception { 
     YahooCrawler crawler = new YahooCrawler("crawl", true); 
     crawler.setThreads(50); 
     crawler.setTopN(100); 
     //crawler.setResumable(true); 
     /*start crawl with depth of 4*/ 
     crawler.start(4); 
    } 

}
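
The original question also asks about storing the extracted data in a database, which none of the answers show. As a sketch, the title and content printed in visit() could be written out with plain JDBC along these lines; the JDBC URL, credentials, and news table here are placeholders, not part of any of the libraries above:

import java.sql.Connection; 
import java.sql.DriverManager; 
import java.sql.PreparedStatement; 
import java.sql.SQLException; 

public class NewsStore { 

    // Placeholder connection settings; adjust for your database 
    // (requires the matching JDBC driver on the classpath). 
    private static final String JDBC_URL = "jdbc:mysql://localhost:3306/crawler"; 
    private static final String USER = "crawler"; 
    private static final String PASSWORD = "secret"; 

    // Assumes a table: CREATE TABLE news (url VARCHAR(512), title TEXT, content TEXT) 
    public static void save(String url, String title, String content) throws SQLException { 
        String sql = "INSERT INTO news (url, title, content) VALUES (?, ?, ?)"; 
        Connection conn = DriverManager.getConnection(JDBC_URL, USER, PASSWORD); 
        try { 
            PreparedStatement stmt = conn.prepareStatement(sql); 
            try { 
                stmt.setString(1, url); 
                stmt.setString(2, title); 
                stmt.setString(3, content); 
                stmt.executeUpdate(); 
            } finally { 
                stmt.close(); 
            } 
        } finally { 
            conn.close(); 
        } 
    } 
} 

Calling NewsStore.save(url, title, content) from visit() in place of the System.out.println calls would persist each news page.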