2010-06-24 44 views
28

我想創建一個函數,用來刪除不在白名單中的 HTML 標籤和屬性。我有以下 HTML(問題標題:用 HTML Agility Pack 剝除不在白名單中的標籤):

<b>first text </b> 
<b>second text here 
     <a>some text here</a> 
<a>some text here</a> 

</b> 
<a>some twxt here</a> 

我使用的是 HTML Agility Pack,以下是我到目前爲止的代碼:

// Whitelist of tag names; only "b" is listed.
// NOTE(review): presumably what callers pass as pWhiteList — confirm at the call sites.
static List<string> WhiteNodeList = new List<string> { "b" }; 
// Whitelist of attribute names; it is empty.
// NOTE(review): presumably what callers pass as attrWhiteList — confirm at the call sites.
static List<string> WhiteAttrList = new List<string> { }; 
// NOTE(review): never assigned in this snippet; check for uses elsewhere before removing.
static HtmlNode htmlNode; 
/// <summary>
/// Strips every descendant element of <paramref name="pNode"/> whose tag name is not in
/// <paramref name="pWhiteList"/>, keeping the content of the stripped element in place, and
/// removes every attribute whose name is not in <paramref name="attrWhiteList"/> from the
/// elements that are kept.
/// </summary>
/// <param name="_output">Receives the outer HTML of <paramref name="pNode"/> after cleaning.</param>
/// <param name="pNode">Root of the subtree to clean. The node itself is never removed.</param>
/// <param name="pWhiteList">Tag names that may stay in the markup.</param>
/// <param name="attrWhiteList">Attribute names that may stay on whitelisted elements.</param>
public static void RemoveNotInWhiteList(out string _output, HtmlNode pNode, List<string> pWhiteList, List<string> attrWhiteList)
{
    StripChildren(pNode, pWhiteList, attrWhiteList);

    // Serialize once, after the whole subtree has been cleaned.
    _output = pNode.OuterHtml;
}

// Depth-first pass that unwraps non-whitelisted child elements and prunes attributes on the kept ones.
private static void StripChildren(HtmlNode parent, List<string> tagWhiteList, List<string> attrWhiteList)
{
    // Iterate over a snapshot: unwrapping a child splices its children into parent.ChildNodes,
    // which would otherwise invalidate the iteration.
    foreach (HtmlNode child in parent.ChildNodes.ToList())
    {
        if (child.NodeType != HtmlNodeType.Element)
        {
            // Text and other non-element nodes are kept as they are.
            continue;
        }

        // Clean the subtree first, so anything hoisted into the parent below is already sanitized
        // and does not need to be visited again.
        StripChildren(child, tagWhiteList, attrWhiteList);

        if (tagWhiteList.Contains(child.Name))
        {
            // ToList() materializes the matches before the attribute collection is modified.
            foreach (HtmlAttribute attribute in child.Attributes.Where(a => !attrWhiteList.Contains(a.Name)).ToList())
            {
                RemoveAttribute(attribute);
            }
        }
        else
        {
            // keepGrandChildren = true: drop only the tag, its content moves up into the parent.
            parent.RemoveChild(child, true);
        }
    }
}

/// <summary>
/// Lower-cases the attribute value, deletes every occurrence of "javascript" from it,
/// and then removes the attribute.
/// </summary>
/// <param name="u">The attribute to scrub and remove.</param>
private static void RemoveAttribute(HtmlAttribute u)
{
    string loweredValue = u.Value.ToLower();
    u.Value = loweredValue.Replace("javascript", "");

    u.Remove();
}

/// <summary>
/// Parses an HTML fragment into a node.
/// </summary>
/// <param name="html">The markup to parse.</param>
/// <returns>
/// The single top-level node when the fragment parses to exactly one node;
/// otherwise the document node that wraps all parsed nodes.
/// </returns>
public static HtmlNode ConvertHtmlToNode(string html)
{
    var document = new HtmlAgilityPack.HtmlDocument();
    document.LoadHtml(html);

    HtmlNode root = document.DocumentNode;
    return root.ChildNodes.Count == 1 ? root.ChildNodes[0] : root;
}

我想要實現的輸出是:

<b>first text </b> 
<b>second text here 
     some text here 
some text here 

</b> 
some twxt here 

這意味着我只想保留<b>標籤。
我這樣做的原因是:有些用戶會把 MS Word 中的內容粘貼到我的 WYSIWYG HTML 編輯器中。

謝謝。

回答

28

嘿,看來我在某人的博客裏幾乎找到了答案……

using System.Collections.Generic; 
using System.Linq; 
using HtmlAgilityPack; 

namespace Wayloop.Blog.Core.Markup
{
    /// <summary>
    /// Whitelist-based HTML sanitizer. Elements whose tag is not whitelisted are removed
    /// together with their content; on whitelisted elements only the listed attributes survive.
    /// </summary>
    public static class HtmlSanitizer
    {
        /// <summary>
        /// Maps an allowed tag name to the attribute names allowed on it.
        /// A <c>null</c> array means that no attribute is allowed on that tag.
        /// </summary>
        private static readonly IDictionary<string, string[]> Whitelist = new Dictionary<string, string[]> {
            { "a", new[] { "href" } },
            { "strong", null },
            { "em", null },
            { "blockquote", null },
        };

        /// <summary>
        /// Parses <paramref name="input"/>, removes everything that is not whitelisted and
        /// returns the resulting markup, trimmed.
        /// </summary>
        /// <param name="input">The HTML to sanitize.</param>
        /// <returns>The sanitized HTML.</returns>
        public static string Sanitize(string input)
        {
            var htmlDocument = new HtmlDocument();

            htmlDocument.LoadHtml(input);
            SanitizeNode(htmlDocument.DocumentNode);

            return htmlDocument.DocumentNode.WriteTo().Trim();
        }

        /// <summary>Sanitizes every child of <paramref name="parentNode"/>.</summary>
        private static void SanitizeChildren(HtmlNode parentNode)
        {
            // Walk backwards: SanitizeNode may remove the child at index i from the collection.
            for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--) {
                SanitizeNode(parentNode.ChildNodes[i]);
            }
        }

        /// <summary>
        /// Removes <paramref name="node"/> when it is a non-whitelisted element; otherwise strips
        /// its disallowed attributes and recurses into its children.
        /// </summary>
        private static void SanitizeNode(HtmlNode node)
        {
            if (node.NodeType == HtmlNodeType.Element) {
                string[] allowedAttributes;
                if (!Whitelist.TryGetValue(node.Name, out allowedAttributes)) {
                    // Not whitelisted: drop the element and everything inside it.
                    node.ParentNode.RemoveChild(node);
                    return;
                }

                if (node.HasAttributes) {
                    // Walk backwards because attributes are removed while iterating.
                    for (int i = node.Attributes.Count - 1; i >= 0; i--) {
                        HtmlAttribute currentAttribute = node.Attributes[i];

                        // A null entry means "no attributes allowed"; it must not be dereferenced.
                        if (allowedAttributes == null || !allowedAttributes.Contains(currentAttribute.Name)) {
                            node.Attributes.Remove(currentAttribute);
                        }
                    }
                }
            }

            if (node.HasChildNodes) {
                SanitizeChildren(node);
            }
        }
    }
}

我是從這裏得到 HtmlSanitizer 的。顯然,它並不是只剝除標籤,而是把整個元素完全移除。

好的,這裏是稍後需要它的人的解決方案。

/// <summary>
/// Whitelist-based HTML sanitizer that strips non-whitelisted tags but keeps their content.
/// Non-whitelisted elements are first renamed to a placeholder tag, the document is serialized
/// and re-parsed, and every placeholder element is then replaced by its own children.
/// </summary>
/// <remarks>
/// NOTE(review): only attribute names are checked, never attribute values, so an allowed
/// <c>href</c> can still carry a <c>javascript:</c> URL. Validate values separately before
/// using this on untrusted input.
/// </remarks>
public static class HtmlSanitizer
{
    /// <summary>
    /// Tag name given to every element that has to be stripped. Renaming avoids building an
    /// XPath expression from arbitrary source tag names. It is lower-case because that is what
    /// HtmlNode.Name reports for the renamed and re-parsed elements.
    /// </summary>
    private const string RemovableNodeName = "removeablenode";

    /// <summary>
    /// Maps an allowed tag name to the attribute names allowed on it.
    /// A <c>null</c> array means that no attribute is allowed on that tag.
    /// </summary>
    private static readonly IDictionary<string, string[]> Whitelist = new Dictionary<string, string[]> {
        { "a", new[] { "href" } },
        { "strong", null },
        { "em", null },
        { "blockquote", null },
        { "b", null },
        { "p", null },
        { "ul", null },
        { "ol", null },
        { "li", null },
        { "div", new[] { "align" } },
        { "strike", null },
        { "u", null },
        { "sub", null },
        { "sup", null },
        { "table", null },
        { "tr", null },
        { "td", null },
        { "th", null }
    };

    /// <summary>
    /// Returns <paramref name="input"/> with every non-whitelisted tag stripped (content kept)
    /// and every non-whitelisted attribute removed.
    /// </summary>
    /// <param name="input">The HTML to sanitize; null or blank input yields an empty string.</param>
    /// <returns>The sanitized HTML.</returns>
    public static string Sanitize(string input)
    {
        if (input == null || input.Trim().Length < 1)
        {
            return string.Empty;
        }

        var htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(input);
        SanitizeNode(htmlDocument.DocumentNode);

        return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim(), CreateXPath());
    }

    /// <summary>Sanitizes every child of <paramref name="parentNode"/>.</summary>
    private static void SanitizeChildren(HtmlNode parentNode)
    {
        for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
        {
            SanitizeNode(parentNode.ChildNodes[i]);
        }
    }

    /// <summary>
    /// Marks <paramref name="node"/> for stripping when it is a non-whitelisted element,
    /// otherwise removes its disallowed attributes; in both cases recurses into the children.
    /// </summary>
    private static void SanitizeNode(HtmlNode node)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            string[] allowedAttributes;
            if (!Whitelist.TryGetValue(node.Name, out allowedAttributes))
            {
                // Mark the element; StripHtml later replaces it with its children.
                node.Name = RemovableNodeName;

                if (node.HasChildNodes)
                {
                    SanitizeChildren(node);
                }

                return;
            }

            if (node.HasAttributes)
            {
                // Walk backwards because attributes are removed while iterating.
                for (int i = node.Attributes.Count - 1; i >= 0; i--)
                {
                    HtmlAttribute currentAttribute = node.Attributes[i];

                    // A null entry means "no attributes allowed".
                    if (allowedAttributes == null || !allowedAttributes.Contains(currentAttribute.Name))
                    {
                        node.Attributes.Remove(currentAttribute);
                    }
                }
            }
        }

        if (node.HasChildNodes)
        {
            SanitizeChildren(node);
        }
    }

    /// <summary>
    /// Re-parses <paramref name="html"/> and replaces every node matched by
    /// <paramref name="xPath"/> with that node's children.
    /// </summary>
    private static string StripHtml(string html, string xPath)
    {
        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);

        if (xPath.Length > 0)
        {
            // SelectNodes returns null, not an empty collection, when nothing matches.
            HtmlNodeCollection invalidNodes = htmlDoc.DocumentNode.SelectNodes(xPath);
            if (invalidNodes != null)
            {
                foreach (HtmlNode node in invalidNodes)
                {
                    // keepGrandChildren = true: only the tag goes away, its content stays.
                    node.ParentNode.RemoveChild(node, true);
                }
            }
        }

        return htmlDoc.DocumentNode.WriteContentTo();
    }

    /// <summary>Builds the XPath expression that selects every placeholder element.</summary>
    private static string CreateXPath()
    {
        return "//" + RemovableNodeName;
    }
}

我重命名了該節點,因爲如果必須解析帶有 XML 命名空間的節點,XPath 解析就會崩潰。

+1

鏈接到HtmlSanitizer被打破。這可能是Meltdown提到的代碼:https://gist.github.com/814428 – 2012-05-24 16:47:19

+0

這絕不是我創建白名單驗證類的代碼。原作者沒有使用RegEx。作者的原始代碼是我發佈的第一段代碼。 – 2012-05-26 12:29:38

+0

此代碼不起作用,我可以使用提交按鈕輕鬆保存表單以及包含有害代碼的腳本部分。 – 2013-02-01 13:44:57

7

感謝您的代碼 - 偉大的事情!

我做了一些優化...

/// <summary>
/// Strips elements whose tag is not a key of Config.TagsWhiteList (their children are kept)
/// and removes attributes that the whitelist entry of a kept element does not list.
/// </summary>
class TagSanitizer
{
    // Elements found during the traversal that have to be unwrapped afterwards.
    List<HtmlNode> _deleteNodes = new List<HtmlNode>();

    /// <summary>Cleans <paramref name="node"/> and its whole subtree in place.</summary>
    public static void Sanitize(HtmlNode node)
    {
        var sanitizer = new TagSanitizer();
        sanitizer.Clean(node);
    }

    // Collects the offending elements first, then unwraps them in reverse discovery order.
    void Clean(HtmlNode node)
    {
        CleanRecursive(node);

        foreach (HtmlNode doomed in Enumerable.Reverse(_deleteNodes))
        {
            // keepGrandChildren = true: the children take the place of the removed element.
            doomed.ParentNode.RemoveChild(doomed, true);
        }
    }

    // Visits node and its descendants, queueing non-whitelisted elements and pruning attributes.
    void CleanRecursive(HtmlNode node)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            if (!Config.TagsWhiteList.ContainsKey(node.Name))
            {
                _deleteNodes.Add(node);
            }
            else if (node.HasAttributes)
            {
                // Walk backwards because attributes are removed while iterating.
                int index = node.Attributes.Count;
                while (--index >= 0)
                {
                    HtmlAttribute candidate = node.Attributes[index];
                    string[] permitted = Config.TagsWhiteList[node.Name];

                    // A null entry means that no attribute is allowed on this tag.
                    bool keep = permitted != null && permitted.Contains(candidate.Name);
                    if (!keep)
                    {
                        node.Attributes.Remove(candidate);
                    }
                }
            }
        }

        if (!node.HasChildNodes)
        {
            return;
        }

        // Iterate over a copy of the child list.
        foreach (HtmlNode child in node.ChildNodes.ToList())
        {
            CleanRecursive(child);
        }
    }
}
+1

這一行裏的 Config 是什麼?if (Config.TagsWhiteList.ContainsKey(node.Name) == false) – 2013-10-09 19:53:00

+0

這只是另一個列表,你可以按自己的需要修改它 :) – Yacov 2013-10-10 07:26:33

+0

順帶一提,我嘗試這段代碼時,生成的標記出現了不一致的問題(部分內容順序錯亂,而且並非所有格式都被正確剝除),這可能是遞歸的多線程優化造成的。 – Elsa 2013-10-25 16:35:50