2010-12-07 28 views
19

我需要一個功能強大的Web刮板庫來從Web上挖掘內容。這可以是付費的或者免費的,對我來說都可以。請爲我推薦一個程式庫或更好的方式來挖掘數據並存儲在我的首選數據庫中。我已經搜索過,但沒有找到任何好的解決方案。我需要專家的好建議。請幫助我。我需要一個功能強大的Web刮板庫

+7

只是一句警告:當刮取內容時,被刮掉的網站可以在不發出警告的情況下徹底改變它們的HTML。有一天,你有滿足你期望的內容;第二天他們用DIV或其他東西代替了HTML表格。制定應急計劃是一個不錯的主意,並且能夠快速修改你正在挖掘的方式。 – 2010-12-07 14:26:26

回答

16

刮擦真的很容易,你只需要解析你正在下載的內容並獲得所有相關的鏈接。

儘管最重要的部分是處理HTML的部分。因爲大多數瀏覽器不需要最清晰的(或符合標準的)HTML來呈現,所以您需要一個HTML分析器,它將能夠理解並非總是格式正確的HTML。

我建議你爲此使用HTML Agility Pack。它在處理格式不良的HTML方面做得非常好,併爲您提供了一個簡單的接口,讓您可以使用XPath查詢獲取生成文檔中的節點。除了這些之外,你只需要選擇一個數據存儲來存放處理過的數據(你可以使用任何數據庫技術),並找到一種從網上下載內容的方法,而.NET爲此提供了兩種高級機制:WebClient類和HttpWebRequest/HttpWebResponse類。

+7

拜託,看在老天的份上,不要向需要強大Web刮板的人推薦WebClient/HttpWebRequest/HttpWebResponse!他還不如直接寫一些套接字來加載所有數據。 – CodingBarfield 2010-12-07 14:13:02

1

我的建議是:

你可以找一個HTML解析器,然後用它從站點中解析出信息(如here)。然後,您需要做的就是以您認爲合適的方式將數據保存到數據庫中。

我已經做了幾次自己的刮板,這很容易,並允許您自定義保存的數據。

數據挖掘工具

如果你真的只想得到一個現成的工具來做到這一點,那麼你應該不難找到一些(finding some)。

2

對於簡單的網站(僅適用於純html),Mechanize工作得非常好,速度也很快。對於使用Javascript,AJAX甚至Flash的網站,您需要一個真正的瀏覽器解決方案,例如iMacros。

6
using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 

namespace SoftCircuits.Parsing 
{ 
    /// <summary>
    /// Describes a single HTML tag: its name, its attributes and whether it was
    /// written with a trailing forward slash.
    /// </summary>
    public class HtmlTag
    {
        /// <summary>
        /// Creates a tag with an empty attribute collection whose keys compare
        /// case-insensitively, so <see cref="HasAttribute"/> can be called on a
        /// new instance without the caller having to assign <see cref="Attributes"/>.
        /// </summary>
        public HtmlTag()
        {
            Attributes = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Name of this tag
        /// </summary>
        public string Name { get; set; }

        /// <summary>
        /// Collection of attribute names and values for this tag. Callers may
        /// replace the collection; it may therefore be null if they assign null.
        /// </summary>
        public Dictionary<string, string> Attributes { get; set; }

        /// <summary>
        /// True if this tag contained a trailing forward slash
        /// </summary>
        public bool TrailingSlash { get; set; }

        /// <summary>
        /// Indicates if this tag contains the specified attribute. Note that
        /// true is returned when this tag contains the attribute even when the
        /// attribute has no value.
        /// </summary>
        /// <param name="name">Name of attribute to check</param>
        /// <returns>True if tag contains attribute; false otherwise, including when
        /// <paramref name="name"/> or the attribute collection is null</returns>
        public bool HasAttribute(string name)
        {
            // A missing collection or a null name means "not present" rather than
            // a NullReferenceException / ArgumentNullException for the caller.
            if (name == null || Attributes == null)
                return false;

            return Attributes.ContainsKey(name);
        }
    }

    public class HtmlParser : TextParser 
    { 
     /// <summary>
     /// Creates a parser by invoking the base <see cref="TextParser"/> default constructor.
     /// </summary>
     public HtmlParser() : base() { }

     /// <summary>
     /// Creates a parser by forwarding the supplied markup to the base
     /// <see cref="TextParser"/> constructor.
     /// </summary>
     /// <param name="html">Text handed unchanged to the base constructor</param>
     public HtmlParser(string html) : base(html) { }

     /// <summary>
     /// Scans forward from the current position for the next opening tag accepted by
     /// <see cref="ParseTag"/> for the given name, stepping over comments, closing
     /// tags and the contents of script blocks on the way.
     /// </summary>
     /// <param name="name">Name of the tags to parse ("*" = parse all tags)</param>
     /// <param name="tag">Receives the next matching tag, or null if none was found</param>
     /// <returns>True if a tag was parsed; false if <paramref name="name"/> is null or
     /// empty, or the end of the document was reached first</returns>
     public bool ParseNext(string name, out HtmlTag tag)
     {
         // The out parameter has to be assigned on every path
         tag = null;

         // An empty request cannot match anything
         if (String.IsNullOrEmpty(name))
             return false;

         // Visit each '<' in the remaining text; the loop increment looks for the
         // next one after every candidate that was skipped or did not match.
         for (MoveTo('<'); !EndOfText; MoveTo('<'))
         {
             // Step over the '<' itself
             MoveAhead();

             char first = Peek();

             if (first == '/')
             {
                 // Closing tag: nothing to report, jump past its '>'
                 MoveTo('>');
                 MoveAhead();
                 continue;
             }

             if (first == '!' && Peek(1) == '-' && Peek(2) == '-')
             {
                 // Comment: jump past the terminator
                 const string commentTerminator = "-->";
                 MoveTo(commentTerminator);
                 MoveAhead(commentTerminator.Length);
                 continue;
             }

             // Ordinary tag: parse it whether or not it was requested
             bool startedScript;
             bool matched = ParseTag(name, ref tag, out startedScript);

             // Script bodies may contain tag characters, so they are skipped
             // wholesale before anything else happens
             if (startedScript)
                 MovePastScript();

             if (matched)
                 return true;
         }

         // Ran out of text without finding a match
         return false;
     }

     /// <summary>
     /// Parses the contents of an HTML tag. The current position should be at the first
     /// character following the tag's opening less-than character.
     ///
     /// Note: We parse to the end of the tag even if this tag was not requested by the
     /// caller. This ensures subsequent parsing takes place after this tag.
     ///
     /// Tag names are matched with an ordinal, case-insensitive comparison, the same
     /// rule the attribute dictionary uses for its keys.
     /// </summary>
     /// <param name="reqName">Name of the tag the caller is requesting, or "*" if caller
     /// is requesting all tags</param>
     /// <param name="tag">Receives a new <see cref="HtmlTag"/> if this tag is one the
     /// caller is requesting; left untouched otherwise</param>
     /// <param name="inScript">Set to true if this tag is a script tag in which no
     /// forward slash was encountered, i.e. a script block began and did not end</param>
     /// <returns>True if data is being returned for a tag requested by the caller
     /// or false otherwise</returns>
     protected bool ParseTag(string reqName, ref HtmlTag tag, out bool inScript)
     {
         bool doctype, requested;
         doctype = inScript = requested = false;

         // Get name of this tag
         string name = ParseTagName();

         // Special handling. Tag names are identifiers rather than linguistic text,
         // so they are compared ordinally: culture-sensitive case folding treats
         // 'i' and 'I' as different letters under cultures such as Turkish, which
         // would stop "SCRIPT" from being recognised as "script".
         if (String.Equals(name, "!DOCTYPE", StringComparison.OrdinalIgnoreCase))
             doctype = true;
         else if (String.Equals(name, "script", StringComparison.OrdinalIgnoreCase))
             inScript = true;

         // Is this a tag requested by caller?
         if (reqName == "*" || String.Equals(name, reqName, StringComparison.OrdinalIgnoreCase))
         {
             // Yes
             requested = true;
             // Create new tag object
             tag = new HtmlTag();
             tag.Name = name;
             tag.Attributes = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
         }

         // Parse attributes
         MovePastWhitespace();
         while (Peek() != '>' && Peek() != NullChar)
         {
             if (Peek() == '/')
             {
                 // Handle trailing forward slash
                 if (requested)
                     tag.TrailingSlash = true;
                 MoveAhead();
                 MovePastWhitespace();
                 // If this is a script tag, it was closed
                 inScript = false;
             }
             else
             {
                 // Parse attribute name. DOCTYPE content is read with the value
                 // parser instead of the name parser.
                 string attrName = (!doctype) ? ParseAttributeName() : ParseAttributeValue();
                 MovePastWhitespace();
                 // Parse attribute value; an attribute without '=' keeps an empty value
                 string value = String.Empty;
                 if (Peek() == '=')
                 {
                     MoveAhead();
                     MovePastWhitespace();
                     value = ParseAttributeValue();
                     MovePastWhitespace();
                 }
                 // Add attribute to collection if requested tag
                 if (requested)
                 {
                     // A repeated attribute replaces the earlier entry. Remove is a
                     // no-op when the key is absent, so no separate lookup is needed.
                     tag.Attributes.Remove(attrName);
                     tag.Attributes.Add(attrName, value);
                 }
             }
         }
         // Skip over closing '>'
         MoveAhead();

         return requested;
     }

     /// <summary>
     /// Parses a tag name. The current position should be the first character of the name.
     /// The name ends at the first whitespace character, '>' or '/', so a self-closing
     /// tag written without a space (for example "&lt;br/&gt;") yields "br" and leaves
     /// the slash at the current position for the caller to handle.
     /// </summary>
     /// <returns>Returns the parsed name string</returns>
     protected string ParseTagName()
     {
         int start = Position;
         while (!EndOfText)
         {
             char c = Peek();
             // '/' is a terminator too; otherwise "<br/>" would be named "br/"
             // and never match a request for "br".
             if (Char.IsWhiteSpace(c) || c == '>' || c == '/')
                 break;
             MoveAhead();
         }
         return Substring(start, Position);
     }

     /// <summary>
     /// Parses an attribute name. The current position should be the first character
     /// of the name. The name ends at the first whitespace character, '>', '=' or '/',
     /// so a valueless attribute directly followed by a closing slash (for example
     /// "&lt;input disabled/&gt;") yields "disabled" and leaves the slash at the
     /// current position for the caller to handle.
     /// </summary>
     /// <returns>Returns the parsed name string</returns>
     protected string ParseAttributeName()
     {
         int start = Position;
         while (!EndOfText)
         {
             char c = Peek();
             // '/' is a terminator too; otherwise the slash of a self-closing tag
             // would become part of the attribute name.
             if (Char.IsWhiteSpace(c) || c == '>' || c == '=' || c == '/')
                 break;
             MoveAhead();
         }
         return Substring(start, Position);
     }

     /// <summary>
     /// Parses an attribute value, quoted or unquoted. The current position should be
     /// the first non-whitespace character following the equal sign.
     ///
     /// Note: A quoted value is also ended by a carriage return or line feed. This
     /// limits the damage done by errors such as a missing closing quote.
     /// </summary>
     /// <returns>Returns the parsed value string, without surrounding quotes</returns>
     protected string ParseAttributeValue()
     {
         char lead = Peek();
         bool quoted = lead == '"' || lead == '\'';

         if (!quoted)
         {
             // Unquoted value: runs up to whitespace, '>' or the end of the text
             int bareStart = Position;
             for (char ch = lead; !EndOfText && !Char.IsWhiteSpace(ch) && ch != '>'; ch = Peek())
                 MoveAhead();
             return Substring(bareStart, Position);
         }

         // Quoted value: step over the opening quote first
         MoveAhead();
         int valueStart = Position;

         // Stop at the matching quote or at a line break, whichever comes first
         MoveTo(new char[] { lead, '\r', '\n' });
         int valueEnd = Position;

         // Consume the closing quote only if that is what stopped the scan
         if (Peek() == lead)
             MoveAhead();

         return Substring(valueStart, valueEnd);
     }

     /// <summary>
     /// Advances past the current script block: finds the next "&lt;/script" that is
     /// followed by '>' or whitespace and moves beyond that closing tag's '>'.
     /// If no such closing tag exists, scanning stops at the end of the text.
     /// </summary>
     protected void MovePastScript()
     {
         const string closer = "</script";
         bool closed = false;

         while (!closed && !EndOfText)
         {
             // NOTE(review): the second argument presumably requests a
             // case-insensitive search - confirm against TextParser.MoveTo
             MoveTo(closer, true);
             MoveAhead(closer.Length);

             // Only a match followed by '>' or whitespace is a real closing tag;
             // anything else (for example "</scripts") just shares the prefix
             char after = Peek();
             closed = after == '>' || Char.IsWhiteSpace(after);
         }

         if (closed)
         {
             // Step over whatever remains of the closing tag
             MoveTo('>');
             MoveAhead();
         }
     }
    } 
}