2012-03-08 63 views







/// <summary>
/// Unfinished attempt at guessing a thumbnail image for <paramref name="document"/>.
/// Starting at the parent of the node matched by <c>DescriptionPredictiveXPath</c>, it walks
/// up the ancestor chain and queries the images below each ancestor, but the step that
/// picks the closest image is not written yet.
/// </summary>
/// <param name="document">The parsed HTML document to inspect.</param>
/// <returns>Nothing yet: every path currently ends in the exception below.</returns>
/// <exception cref="NotImplementedException">Always thrown, because no selection strategy is implemented.</exception>
protected string GuessThumbnail(HtmlDocument document)
{
    HtmlNode root = document.DocumentNode;

    HtmlNode description = root.SelectSingleNode(DescriptionPredictiveXPath);
    if (description != null)
    {
        // In this case, we predict relevant images are the ones closest to the description text node.
        HtmlNode node = description.ParentNode;
        while (node != null)
        {
            string path = string.Concat(node.XPath, ImageXPath);
            node = node.ParentNode; // advance first; 'path' already captured the current ancestor
            IEnumerable<HtmlNode> nodes = root.SelectNodesOrEmpty(path);

            // Find the image tag that's closest to the text node.
            if (nodes.Any())
            {
                // Candidate image XPaths; input for the selection step that is still missing.
                var xpaths = nodes.Select(n => n.XPath);

                // TODO: return closest
            }
        }
    }

    // figure some other way to do it
    throw new NotImplementedException();
}

你所說的「最接近」,是指在文檔結構中與目標元素的距離有多近嗎? – JamieSee 2012-03-08 17:56:01


是的,就是這樣。我想知道'div [7]'比'div [5]'更接近'div [4]',如果有多個'div [5]',那麼檢查下一層,等等直到找到最接近的元素。 – bevacqua 2012-03-08 17:59:12


您的代碼是否使用CodePlex的Html Agility Pack? – JamieSee 2012-03-08 18:27:57




/// <summary>
/// Guesses the thumbnail for <paramref name="document"/> by picking the image that sits
/// closest to the node matched by <c>DescriptionPredictiveXPath</c>.
/// </summary>
/// <remarks>
/// The search starts at the description's parent and moves up one ancestor at a time.
/// At the first ancestor that has any image below it (per <c>ImageXPath</c>), each image is
/// ranked by how far its branch is from the description's branch among that ancestor's
/// direct children, and the nearest one wins. Ties go to the first image returned by the query.
/// </remarks>
/// <param name="document">The parsed HTML document to inspect.</param>
/// <returns>The value of the <c>src</c> attribute of the closest image.</returns>
/// <exception cref="NotImplementedException">
/// Thrown when no description node is found, or when no ancestor of it contains an image.
/// </exception>
protected string GuessThumbnail(HtmlDocument document)
{
    HtmlNode root = document.DocumentNode;
    HtmlNode description = root.SelectSingleNode(DescriptionPredictiveXPath);

    if (description != null)
    {
        // In this case, we predict relevant images are the ones closest to the description text node.
        HtmlNode parent = description.ParentNode;
        while (parent != null)
        {
            string path = string.Concat(parent.XPath, ImageXPath);
            IList<HtmlNode> images = root.SelectNodesOrEmpty(path).ToList();

            // Find the image tag that's closest to the text node.
            if (images.Any())
            {
                // The direct child of 'parent' that leads down to the description, and its index.
                HtmlNode descriptionOutermost = description.ParentNodeUntil(parent);
                int descriptionIndex = descriptionOutermost.GetIndex();

                HtmlNode closestToDescription = null;
                int distanceToDescription = int.MaxValue;

                foreach (HtmlNode image in images)
                {
                    // Index of the direct child of 'parent' that leads down to this image.
                    int index = image.ParentNodeUntil(parent).GetIndex();

                    // Absolute sibling distance. The previous sign-flip computed
                    // descriptionIndex + index for images located after the description,
                    // which made nearby following images look farther than distant preceding ones.
                    int distance = Math.Abs(descriptionIndex - index);
                    if (distance < distanceToDescription)
                    {
                        closestToDescription = image;
                        distanceToDescription = distance;
                    }
                }

                if (closestToDescription != null)
                {
                    // NOTE(review): assumes every node matched by ImageXPath carries a src attribute - confirm.
                    string source = closestToDescription.Attributes["src"].Value;
                    return source;
                }
            }

            parent = parent.ParentNode;
        }
    }

    // figure some other way to do it
    throw new NotImplementedException();
}

/// <summary>
/// Climbs from <paramref name="node"/> towards the root and returns the node whose
/// <c>ParentNode</c> is <paramref name="parent"/>.
/// </summary>
/// <param name="node">The node to start climbing from.</param>
/// <param name="parent">The ancestor at which the climb stops.</param>
/// <returns>
/// <paramref name="node"/> itself when its parent is already <paramref name="parent"/>;
/// otherwise the first node on the way up whose parent is <paramref name="parent"/>.
/// </returns>
public static HtmlNode ParentNodeUntil(this HtmlNode node, HtmlNode parent)
{
    HtmlNode current = node;

    // 'above' is always the parent of 'current'; stop as soon as it is the requested ancestor.
    for (HtmlNode above = current.ParentNode; above != parent; above = current.ParentNode)
    {
        current = above;
    }

    return current;
}
/// <summary>
/// Looks up <paramref name="node"/> in the <c>ChildNodes</c> collection of its parent.
/// </summary>
/// <param name="node">The node to locate; its <c>ParentNode</c> is dereferenced directly.</param>
/// <returns>The value reported by <c>IndexOf</c> on the parent's <c>ChildNodes</c>.</returns>
public static int GetIndex(this HtmlNode node)
{
    HtmlNode container = node.ParentNode;
    return container.ChildNodes.IndexOf(node);
}


如果您可以將任意數據附加到節點上,就直接把位置資訊附加上去。否則,可以用一個字典來保存所有節點到其位置的映射。
